From 031a0300f2c94e81598bcc08a9e6de7f10a18d7b Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 1 Sep 2022 18:07:04 +0200 Subject: [PATCH 001/707] ecryptfs: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Since its use in fs/ecryptfs is safe everywhere, it should be preferred. Therefore, replace kmap() with kmap_local_page() in fs/ecryptfs. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20220901160704.25701-1-fmdefrancesco@gmail.com --- fs/ecryptfs/crypto.c | 8 ++++---- fs/ecryptfs/read_write.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0ad..03263ebcccc6bd 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -465,10 +465,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -514,10 +514,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe57e..5edf027c835906 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt); flush_dcache_page(page_for_ecryptfs); return rc; } From c1cc2db216078f9b1e29c991b1b9177c26757162 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 22 Jul 2022 18:02:12 +0800 Subject: [PATCH 002/707] ecryptfs: keystore: Fix typo 'the the' in comment Replace 'the the' with 'the' in the comment. Signed-off-by: Slark Xiao Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20220722100212.79490-1-slark_xiao@163.com --- fs/ecryptfs/keystore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 3fe41964c0d8d9..2452d6fd7062d7 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -878,7 +878,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { * @filename: This function kmalloc's the memory for the filename * @filename_size: This function sets this to the amount of memory * kmalloc'd for the filename - * @packet_size: This function sets this to the the number of octets + * @packet_size: This function sets this to the number of octets * in the packet parsed * @mount_crypt_stat: The mount-wide cryptographic context * @data: The memory location containing the start of the tag 70 From a3d78fe3e1ae8c6a1901635c54a1a799656f72c8 Mon Sep 17 00:00:00 2001 From: Zipeng Zhang Date: Mon, 20 Mar 2023 10:04:28 +0800 Subject: [PATCH 003/707] fs: ecryptfs: comment typo fix Comment typo fix "vitual" -> "virtual". Signed-off-by: Zipeng Zhang Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03263ebcccc6bd..c64985bf8c9e34 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1313,7 +1313,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, /** * ecryptfs_read_xattr_region - * @page_virt: The vitual address into which to read the xattr data + * @page_virt: The virtual address into which to read the xattr data * @ecryptfs_inode: The eCryptfs inode * * Attempts to read the crypto metadata from the extended attribute From 7c1b1906229db88c487e21e1ecb622db64a1830d Mon Sep 17 00:00:00 2001 From: Han Xu Date: Wed, 8 Nov 2023 09:07:01 -0600 Subject: [PATCH 004/707] mtd: spinand: gigadevice: Fix the get ecc status issue Some GigaDevice ecc_get_status functions use on-stack buffer for spi_mem_op causes spi_mem_check_op failing, fix the issue by using spinand scratchbuf. Fixes: c40c7a990a46 ("mtd: spinand: Add support for GigaDevice GD5F1GQ4UExxG") Signed-off-by: Han Xu Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231108150701.593912-1-han.xu@nxp.com --- drivers/mtd/nand/spi/gigadevice.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/spi/gigadevice.c b/drivers/mtd/nand/spi/gigadevice.c index 987710e09441ad..6023cba748bb85 100644 --- a/drivers/mtd/nand/spi/gigadevice.c +++ b/drivers/mtd/nand/spi/gigadevice.c @@ -186,7 +186,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand, { u8 status2; struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2, - &status2); + spinand->scratchbuf); int ret; switch (status & STATUS_ECC_MASK) { @@ -207,6 +207,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand, * report the maximum of 4 in this case */ /* bits sorted this way (3...0): ECCS1,ECCS0,ECCSE1,ECCSE0 */ + status2 = *(spinand->scratchbuf); return ((status & STATUS_ECC_MASK) >> 2) | ((status2 & STATUS_ECC_MASK) >> 4); @@ -228,7 +229,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand, { u8 status2; struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2, - &status2); + spinand->scratchbuf); int ret; switch (status & STATUS_ECC_MASK) { @@ -248,6 +249,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand, * 1 ... 4 bits are flipped (and corrected) */ /* bits sorted this way (1...0): ECCSE1, ECCSE0 */ + status2 = *(spinand->scratchbuf); return ((status2 & STATUS_ECC_MASK) >> 4) + 1; case STATUS_ECC_UNCOR_ERROR: From 564eac2860bdbe6ac651e6909ac07ecd93d778f3 Mon Sep 17 00:00:00 2001 From: Peter Martincic Date: Mon, 27 Nov 2023 13:35:24 -0800 Subject: [PATCH 005/707] hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC Hyper-V hosts can omit the _SYNC flag to due a bug on resume from modern suspend. In such a case, the guest may fail to update its time-of-day to account for the period when it was suspended, and could proceed with a significantly wrong time-of-day. In such a case when the guest is significantly behind, fix it by treating a _SAMPLE the same as if _SYNC was received so that the guest time-of-day is updated. This is hidden behind param hv_utils.timesync_implicit. Signed-off-by: Peter Martincic Acked-by: Boqun Feng Link: https://lore.kernel.org/r/20231127213524.52783-1-pmartincic@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20231127213524.52783-1-pmartincic@linux.microsoft.com> --- drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 42aec2c5606af7..9c97c4065fe736 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -296,6 +296,11 @@ static struct { spinlock_t lock; } host_ts; +static bool timesync_implicit; + +module_param(timesync_implicit, bool, 0644); +MODULE_PARM_DESC(timesync_implicit, "If set treat SAMPLE as SYNC when clock is behind"); + static inline u64 reftime_to_ns(u64 reftime) { return (reftime - WLTIMEDELTA) * 100; @@ -344,6 +349,29 @@ static void hv_set_host_time(struct work_struct *work) do_settimeofday64(&ts); } +/* + * Due to a bug on Hyper-V hosts, the sync flag may not always be sent on resume. + * Force a sync if the guest is behind. + */ +static inline bool hv_implicit_sync(u64 host_time) +{ + struct timespec64 new_ts; + struct timespec64 threshold_ts; + + new_ts = ns_to_timespec64(reftime_to_ns(host_time)); + ktime_get_real_ts64(&threshold_ts); + + threshold_ts.tv_sec += 5; + + /* + * If guest behind the host by 5 or more seconds. + */ + if (timespec64_compare(&new_ts, &threshold_ts) >= 0) + return true; + + return false; +} + /* * Synchronize time with host after reboot, restore, etc. * @@ -384,7 +412,8 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) spin_unlock_irqrestore(&host_ts.lock, flags); /* Schedule work to do do_settimeofday64() */ - if (adj_flags & ICTIMESYNCFLAG_SYNC) + if ((adj_flags & ICTIMESYNCFLAG_SYNC) || + (timesync_implicit && hv_implicit_sync(host_ts.host_time))) schedule_work(&adj_time_work); } From 96c4f072b2ed4beaed7b001c9eb1a4d997ff3a22 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:56 -0800 Subject: [PATCH 006/707] dt-bindings: arm: aspeed: document ASRock SPC621D8HM3 Document ASRock SPC621D8HM3 board compatible. Signed-off-by: Zev Weiss Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231120121954.19926-5-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 749ee54a3ff83a..f8f66821cb5faa 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -36,6 +36,7 @@ properties: - aspeed,ast2500-evb - asrock,e3c246d4i-bmc - asrock,romed8hm3-bmc + - asrock,spc621d8hm3-bmc - bytedance,g220a-bmc - facebook,cmm-bmc - facebook,minipack-bmc From 2e09eb0615f012fb0d967e864d18b121b8ed2ae4 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:57 -0800 Subject: [PATCH 007/707] ARM: dts: aspeed: Add ASRock SPC621D8HM3 BMC This is a Xeon board broadly similar (aside from CPU vendor) to the already-support romed8hm3 (half-width, single-socket, ast2500). It doesn't require anything terribly special for OpenBMC support, so this device-tree should provide everything necessary for basic functionality with it. Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20231120121954.19926-6-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../aspeed/aspeed-bmc-asrock-spc621d8hm3.dts | 324 ++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index d3ac20e316d01e..2df0a2e88df712 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -10,6 +10,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-arm-stardragon4800-rep2.dtb \ aspeed-bmc-asrock-e3c246d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ + aspeed-bmc-asrock-spc621d8hm3.dtb \ aspeed-bmc-bytedance-g220a.dtb \ aspeed-bmc-delta-ahe50dc.dtb \ aspeed-bmc-facebook-bletchley.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts new file mode 100644 index 00000000000000..555485871e7a7d --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; + +#include "aspeed-g5.dtsi" +#include +#include +#include +#include + +/{ + model = "ASRock SPC621D8HM3 BMC"; + compatible = "asrock,spc621d8hm3-bmc", "aspeed,ast2500"; + + aliases { + serial4 = &uart5; + + i2c20 = &i2c1mux0ch0; + i2c21 = &i2c1mux0ch1; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + leds { + compatible = "gpio-leds"; + + /* BMC heartbeat */ + led-0 { + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_HEARTBEAT; + color = ; + linux,default-trigger = "timer"; + }; + + /* system fault */ + led-1 { + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_FAULT; + color = ; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, + <&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>, + <&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>, + <&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>; + }; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; + m25p,fast-read; + label = "bmc"; + spi-max-frequency = <50000000>; /* 50 MHz */ +#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart5 { + status = "okay"; +}; + +&vuart { + status = "okay"; + aspeed,lpc-io-reg = <0x2f8>; + aspeed,lpc-interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; +}; + +&mac0 { + status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + status = "okay"; +}; + +&i2c1 { + status = "okay"; + + /* hardware monitor/thermal sensor */ + temperature-sensor@29 { + compatible = "nuvoton,nct7802"; + reg = <0x29>; + }; + + /* motherboard temp sensor (TMP1, near BMC) */ + temperature-sensor@4c { + compatible = "nuvoton,w83773g"; + reg = <0x4c>; + }; + + /* motherboard FRU eeprom */ + eeprom@50 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x50>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + }; + + /* M.2 slot smbus mux */ + i2c-mux@71 { + compatible = "nxp,pca9545"; + reg = <0x71>; + #address-cells = <1>; + #size-cells = <0>; + + i2c1mux0ch0: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c1mux0ch1: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c2 { + status = "okay"; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c8 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; +}; + +&i2c10 { + status = "okay"; +}; + +&i2c11 { + status = "okay"; +}; + +&i2c12 { + status = "okay"; +}; + +&i2c13 { + status = "okay"; +}; + +&video { + status = "okay"; +}; + +&vhub { + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&peci0 { + status = "okay"; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default + &pinctrl_pwm2_default + &pinctrl_pwm3_default + &pinctrl_pwm4_default>; + + fan@0 { + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + fan@2 { + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + fan@3 { + reg = <0x03>; + aspeed,fan-tach-ch = /bits/ 8 <0x03>; + }; + + fan@4 { + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x04>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /* A */ "LOCATORLED_STATUS_N", "LOCATORBTN_N", + "BMC_READY_N", "FM_SPD_DDRCPU_LVLSHFT_EN", + "", "", "", "", + /* B */ "NODE_ID_1", "NODE_ID_2", "PSU_FAN_FAIL_N", "", + "", "", "", "GPIO_RST", + /* C */ "", "", "", "", "", "", "", "", + /* D */ "FP_PWR_BTN_MUX_N", "FM_BMC_PWRBTN_OUT_N", + "FP_RST_BTN_N", "RST_BMC_RSTBTN_OUT_N", + "NMI_BTN_N", "BMC_NMI", + "", "", + /* E */ "", "", "", "FM_ME_RCVR_N", "", "", "", "", + /* F */ "BMC_SMB_SEL_N", "FM_CPU2_DISABLE_COD_N", + "FM_REMOTE_DEBUG_BMC_EN", "FM_CPU_ERR0_LVT3_EN", + "FM_CPU_ERR1_LVT3_EN", "FM_CPU_ERR2_LVT3_EN", + "FM_MEM_THERM_EVENT_CPU1_LVT3_N", "FM_MEM_THERM_EVENT_CPU2_LVT3_N", + /* G */ "HWM_BAT_EN", "", "BMC_PHYRST_N", "FM_BIOS_SPI_BMC_CTRL", + "BMC_ALERT1_N", "BMC_ALERT2_N", "BMC_ALERT3_N", "IRQ_SML0_ALERT_N", + /* H */ "BMC_SMB_PRESENT_1_N", "FM_PCH_CORE_VID_0", "FM_PCH_CORE_VID_1", "", + "FM_MFG_MODE", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN", + /* I */ "IRQ_PVDDQ_ABCD_CPU1_VRHOT_LVC3_N", "IRQ_PVDDQ_ABCD_CPU2_VRHOT_LVC3_N", + "IRQ_PVDDQ_EFGH_CPU1_VRHOT_LVC3_N", "IRQ_PVDDQ_EFGH_CPU2_VRHOT_LVC3_N", + "", "", "", "", + /* J */ "", "", "", "", "", "", "", "", + /* K */ "", "", "", "", "", "", "", "", + /* L */ "", "", "", "", "", "", "", "", + /* M */ "FM_PVCCIN_CPU1_PWR_IN_ALERT_N", "FM_PVCCIN_CPU2_PWR_IN_ALERT_N", + "IRQ_PVCCIN_CPU1_VRHOT_LVC3_N", "IRQ_PVCCIN_CPU2_VRHOT_LVC3_N", + "FM_CPU1_PROCHOT_BMC_LVC3_N", "", + "FM_CPU1_MEMHOT_OUT_N", "FM_CPU2_MEMHOT_OUT_N", + /* N */ "", "", "", "", "", "", "", "", + /* O */ "", "", "", "", "", "", "", "", + /* P */ "", "", "", "", "", "", "", "", + /* Q */ "", "", "", "", "", "", "RST_GLB_RST_WARN_N", "PCIE_WAKE_N", + /* R */ "", "", "FM_BMC_SUSACK_N", "FM_BMC_EUP_LOT6_N", + "", "FM_BMC_PCH_SCI_LPC_N", "", "", + /* S */ "FM_DBP_PRESENT_N", "FM_CPU2_SKTOCC_LCT3_N", + "FM_CPU1_FIVR_FAULT_LVT3", "FM_CPU2_FIVR_FAULT_LVT3", + "", "", "", "", + /* T */ "", "", "", "", "", "", "", "", + /* U */ "", "", "", "", "", "", "", "", + /* V */ "", "", "", "", "", "", "", "", + /* W */ "", "", "", "", "", "", "", "", + /* X */ "", "", "", "", "", "", "", "", + /* Y */ "FM_SLPS3_N", "FM_SLPS4_N", "", "FM_BMC_ONCTL_N_PLD", + "", "", "", "", + /* Z */ "FM_CPU_MSMI_CATERR_LVT3_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N", + "", "", "", "", + /* AA */ "FM_CPU1_THERMTRIP_LATCH_LVT3_N", "FM_CPU2_THERMTRIP_LATCH_LVT3_N", + "FM_BIOS_POST_COMPLT_N", "DBP_BMC_SYSPWROK", + "", "IRQ_SML0_ALERT_MUX_N", + "IRQ_SMI_ACTIVE_N", "IRQ_NMI_EVENT_N", + /* AB */ "FM_PCH_BMC_THERMTRIP_N", "PWRGD_SYS_PWROK", + "ME_OVERRIDE", "IRQ_BMC_PCH_SMI_LPC_N", + "", "", "", "", + /* AC */ "", "", "", "", "", "", "", ""; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */ + &pinctrl_adc1_default /* 5VSB */ + &pinctrl_adc2_default /* CPU1 */ + &pinctrl_adc3_default /* NC */ + &pinctrl_adc4_default /* VCCMABCD */ + &pinctrl_adc5_default /* VCCMEFGH */ + &pinctrl_adc6_default /* NC */ + &pinctrl_adc7_default /* NC */ + &pinctrl_adc8_default /* PVNN_PCH */ + &pinctrl_adc9_default /* 1P05PCH */ + &pinctrl_adc10_default /* 1P8PCH */ + &pinctrl_adc11_default /* BAT */ + &pinctrl_adc12_default /* 3V */ + &pinctrl_adc13_default /* 5V */ + &pinctrl_adc14_default /* 12V */ + &pinctrl_adc15_default>; /* GND */ +}; From 01bb8d5bf1ab1bd847a277f546e5f9af2c6933e1 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:03 -0800 Subject: [PATCH 008/707] dt-bindings: arm: aspeed: document ASRock E3C256D4I Document ASRock E3C256D4I board compatible. Signed-off-by: Zev Weiss Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231120121901.19817-5-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index f8f66821cb5faa..6f7543463d894c 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -35,6 +35,7 @@ properties: - ampere,mtjade-bmc - aspeed,ast2500-evb - asrock,e3c246d4i-bmc + - asrock,e3c256d4i-bmc - asrock,romed8hm3-bmc - asrock,spc621d8hm3-bmc - bytedance,g220a-bmc From f957714c0f5353b151639654a62680d27cf53e44 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 20 Nov 2023 04:19:04 -0800 Subject: [PATCH 009/707] ARM: dts: aspeed: Add ASRock E3C256D4I BMC Like the E3C246D4I, this is a reasonably affordable off-the-shelf mini-ITX AST2500/Xeon motherboard with good potential as an OpenBMC development platform. Booting the host requires a modicum of eSPI support that's not yet in the mainline kernel, but most other basic BMC functionality is available with this device-tree. Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20231120121901.19817-6-zev@bewilderbeest.net Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../aspeed/aspeed-bmc-asrock-e3c256d4i.dts | 322 ++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 2df0a2e88df712..3e3e6b96cb799d 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -9,6 +9,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-ampere-mtmitchell.dtb \ aspeed-bmc-arm-stardragon4800-rep2.dtb \ aspeed-bmc-asrock-e3c246d4i.dtb \ + aspeed-bmc-asrock-e3c256d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ aspeed-bmc-asrock-spc621d8hm3.dtb \ aspeed-bmc-bytedance-g220a.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts new file mode 100644 index 00000000000000..263fcc8106ffaa --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; + +#include "aspeed-g5.dtsi" +#include +#include +#include +#include +#include + +/{ + model = "ASRock E3C256D4I BMC"; + compatible = "asrock,e3c256d4i-bmc", "aspeed,ast2500"; + + aliases { + serial4 = &uart5; + + i2c20 = &i2c2mux0ch0; + i2c21 = &i2c2mux0ch1; + i2c22 = &i2c2mux0ch2; + i2c23 = &i2c2mux0ch3; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + leds { + compatible = "gpio-leds"; + + /* BMC heartbeat */ + led-0 { + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_HEARTBEAT; + color = ; + linux,default-trigger = "timer"; + }; + + /* system fault */ + led-1 { + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + function = LED_FUNCTION_FAULT; + color = ; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, + <&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>, + <&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>, + <&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>; + }; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; + m25p,fast-read; + label = "bmc"; + spi-max-frequency = <100000000>; /* 100 MHz */ +#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart1 { + status = "okay"; +}; + +&uart2 { + status = "okay"; +}; + +&uart3 { + status = "okay"; +}; + +&uart4 { + status = "okay"; +}; + +&uart5 { + status = "okay"; +}; + +&uart_routing { + status = "okay"; +}; + +&mac0 { + status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + status = "okay"; +}; + +&i2c1 { + status = "okay"; +}; + +&i2c2 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + i2c2mux0ch0: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c2mux0ch1: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + + i2c2mux0ch2: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + + i2c2mux0ch3: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; +}; + +&i2c10 { + status = "okay"; +}; + +&i2c11 { + status = "okay"; + + vrm@60 { + compatible = "renesas,isl69269", "isl69269"; + reg = <0x60>; + }; +}; + +&i2c12 { + status = "okay"; + + /* FRU eeprom */ + eeprom@57 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x57>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + }; +}; + +&video { + status = "okay"; +}; + +&vhub { + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&peci0 { + status = "okay"; +}; + +&wdt1 { + aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>; +}; + +&wdt2 { + aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default /* CPU */ + &pinctrl_pwm2_default /* rear */ + &pinctrl_pwm4_default>; /* front */ + + /* CPU */ + fan@0 { + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + /* rear */ + fan@2 { + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + /* front */ + fan@4 { + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x04>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /* A */ "", "", "NMI_BTN_N", "BMC_NMI", "", "", "", "", + /* B */ "", "", "", "", "", "", "", "", + /* C */ "", "", "", "", "", "", "", "", + /* D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON", + "", "", "", "", + /* E */ "", "", "", "", "", "", "", "", + /* F */ "LOCATORLED_STATUS_N", "LOCATORBTN", "", "", + "", "", "BMC_PCH_SCI_LPC", "BMC_NCSI_MUX_CTL", + /* G */ "HWM_BAT_EN", "CHASSIS_ID0", "CHASSIS_ID1", "CHASSIS_ID2", + "", "", "", "", + /* H */ "FM_ME_RCVR_N", "O_PWROK", "", "D4_DIMM_EVENT_3V_N", + "MFG_MODE_N", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN", + /* I */ "", "", "", "", "", "", "", "", + /* J */ "BMC_READY", "BMC_PCH_BIOS_CS_N", "BMC_SMI", "", "", "", "", "", + /* K */ "", "", "", "", "", "", "", "", + /* L */ "", "", "", "", "", "", "", "", + /* M */ "", "", "", "", "", "", "", "", + /* N */ "", "", "", "", "", "", "", "", + /* O */ "", "", "", "", "", "", "", "", + /* P */ "", "", "", "", "", "", "", "", + /* Q */ "", "", "", "", "", "", "", "", + /* R */ "", "", "", "", "", "", "", "", + /* S */ "PCHHOT_BMC_N", "", "RSMRST", "", "", "", "", "", + /* T */ "", "", "", "", "", "", "", "", + /* U */ "", "", "", "", "", "", "", "", + /* V */ "", "", "", "", "", "", "", "", + /* W */ "", "", "", "", "", "", "", "", + /* X */ "", "", "", "", "", "", "", "", + /* Y */ "SLP_S3", "SLP_S5", "", "", "", "", "", "", + /* Z */ "CPU_CATERR_BMC_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N", + "", "", "", "", + /* AA */ "CPU1_THERMTRIP_LATCH_N", "", "CPU1_PROCHOT_N", "", + "", "", "IRQ_SMI_ACTIVE_N", "FM_BIOS_POST_CMPLT_N", + /* AB */ "", "", "ME_OVERRIDE", "BMC_DMI_MODIFY", "", "", "", "", + /* AC */ "", "", "", "", "", "", "", ""; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */ + &pinctrl_adc1_default /* 5VSB */ + &pinctrl_adc2_default /* CPU1 */ + &pinctrl_adc3_default /* VCCSA */ + &pinctrl_adc4_default /* VCCM */ + &pinctrl_adc5_default /* V10M */ + &pinctrl_adc6_default /* VCCIO */ + &pinctrl_adc7_default /* VCCGT */ + &pinctrl_adc8_default /* VPPM */ + &pinctrl_adc9_default /* BAT */ + &pinctrl_adc10_default /* 3V */ + &pinctrl_adc11_default /* 5V */ + &pinctrl_adc12_default /* 12V */ + &pinctrl_adc13_default /* GND */ + &pinctrl_adc14_default /* GND */ + &pinctrl_adc15_default>; /* GND */ +}; From eadd52a6233d4e50391eb68a7a77c24a8c262313 Mon Sep 17 00:00:00 2001 From: Renze Nicolai Date: Sat, 2 Dec 2023 01:38:44 +0100 Subject: [PATCH 010/707] dt-bindings: arm: aspeed: add Asrock X570D4U board Document Asrock X570D4U compatible. Signed-off-by: Renze Nicolai Acked-by: Krzysztof Kozlowski Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231202003908.3635695-2-renze@rnplus.nl Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 6f7543463d894c..85e2c00a238477 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -38,6 +38,7 @@ properties: - asrock,e3c256d4i-bmc - asrock,romed8hm3-bmc - asrock,spc621d8hm3-bmc + - asrock,x570d4u-bmc - bytedance,g220a-bmc - facebook,cmm-bmc - facebook,minipack-bmc From ecab6c95f79bb6143090d0d48ee26501f28e0a59 Mon Sep 17 00:00:00 2001 From: Renze Nicolai Date: Sat, 2 Dec 2023 01:38:45 +0100 Subject: [PATCH 011/707] ARM: dts: aspeed: asrock: Add ASRock X570D4U BMC This is a relatively low-cost AST2500-based Amd Ryzen 5000 Series micro-ATX board that we hope can provide a decent platform for OpenBMC development. This initial device-tree provides the necessary configuration for basic BMC functionality such as serial console, KVM support and POST code snooping. Signed-off-by: Renze Nicolai Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231202003908.3635695-3-renze@rnplus.nl Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../dts/aspeed/aspeed-bmc-asrock-x570d4u.dts | 377 ++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 3e3e6b96cb799d..fb9cc95f1b60f3 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -12,6 +12,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-asrock-e3c256d4i.dtb \ aspeed-bmc-asrock-romed8hm3.dtb \ aspeed-bmc-asrock-spc621d8hm3.dtb \ + aspeed-bmc-asrock-x570d4u.dtb \ aspeed-bmc-bytedance-g220a.dtb \ aspeed-bmc-delta-ahe50dc.dtb \ aspeed-bmc-facebook-bletchley.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts new file mode 100644 index 00000000000000..3c975bc41ae7de --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0+ +/dts-v1/; +#include "aspeed-g5.dtsi" +#include +#include + +/ { + model = "Asrock Rack X570D4U BMC"; + compatible = "asrock,x570d4u-bmc", "aspeed,ast2500"; + + aliases { + i2c40 = &i2c4mux0ch0; + i2c41 = &i2c4mux0ch1; + i2c42 = &i2c4mux0ch2; + i2c43 = &i2c4mux0ch3; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + reg = <0x80000000 0x20000000>; + }; + + reserved-memory { + #address-cells = <1>; + #size-cells = <1>; + ranges; + + pci_memory: region@9a000000 { + no-map; + reg = <0x9a000000 0x00010000>; /* 64K */ + }; + + video_engine_memory: jpegbuffer { + size = <0x02800000>; /* 40M */ + alignment = <0x01000000>; + compatible = "shared-dma-pool"; + reusable; + }; + + gfx_memory: framebuffer { + size = <0x01000000>; + alignment = <0x01000000>; + compatible = "shared-dma-pool"; + reusable; + }; + }; + + leds { + compatible = "gpio-leds"; + + led-0 { + /* led-heartbeat-n */ + gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>; + color = ; + function = LED_FUNCTION_HEARTBEAT; + linux,default-trigger = "timer"; + }; + + led-1 { + /* led-fault-n */ + gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>; + color = ; + function = LED_FUNCTION_FAULT; + panic-indicator; + }; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, <&adc 4>, + <&adc 5>, <&adc 6>, <&adc 7>, <&adc 8>, <&adc 9>, + <&adc 10>, <&adc 11>, <&adc 12>; + }; +}; + +&gpio { + status = "okay"; + gpio-line-names = + /*A0-A3*/ "status-locatorled-n", "", "button-nmi-n", "", + /*A4-A7*/ "", "", "", "", + /*B0-B3*/ "input-bios-post-cmplt-n", "", "", "", + /*B4-B7*/ "", "", "", "", + /*C0-C3*/ "", "", "", "", + /*C4-C7*/ "", "", "control-locatorbutton", "", + /*D0-D3*/ "button-power", "control-power", "button-reset", "control-reset", + /*D4-D7*/ "", "", "", "", + /*E0-E3*/ "", "", "", "", + /*E4-E7*/ "", "", "", "", + /*F0-F3*/ "", "", "", "", + /*F4-F7*/ "", "", "", "", + /*G0-G3*/ "output-rtc-battery-voltage-read-enable", "input-id0", "input-id1", "input-id2", + /*G4-G7*/ "input-alert1-n", "input-alert2-n", "input-alert3-n", "", + /*H0-H3*/ "", "", "", "", + /*H4-H7*/ "input-mfg", "", "led-heartbeat-n", "input-caseopen", + /*I0-I3*/ "", "", "", "", + /*I4-I7*/ "", "", "", "", + /*J0-J3*/ "output-bmc-ready", "", "", "", + /*J4-J7*/ "", "", "", "", + /*K0-K3*/ "", "", "", "", + /*K4-K7*/ "", "", "", "", + /*L0-L3*/ "", "", "", "", + /*L4-L7*/ "", "", "", "", + /*M0-M3*/ "", "", "", "", + /*M4-M7*/ "", "", "", "", + /*N0-N3*/ "", "", "", "", + /*N4-N7*/ "", "", "", "", + /*O0-O3*/ "", "", "", "", + /*O4-O7*/ "", "", "", "", + /*P0-P3*/ "", "", "", "", + /*P4-P7*/ "", "", "", "", + /*Q0-Q3*/ "", "", "", "", + /*Q4-Q7*/ "", "", "", "", + /*R0-R3*/ "", "", "", "", + /*R4-R7*/ "", "", "", "", + /*S0-S3*/ "input-bmc-pchhot-n", "", "", "", + /*S4-S7*/ "", "", "", "", + /*T0-T3*/ "", "", "", "", + /*T4-T7*/ "", "", "", "", + /*U0-U3*/ "", "", "", "", + /*U4-U7*/ "", "", "", "", + /*V0-V3*/ "", "", "", "", + /*V4-V7*/ "", "", "", "", + /*W0-W3*/ "", "", "", "", + /*W4-W7*/ "", "", "", "", + /*X0-X3*/ "", "", "", "", + /*X4-X7*/ "", "", "", "", + /*Y0-Y3*/ "", "", "", "", + /*Y4-Y7*/ "", "", "", "", + /*Z0-Z3*/ "", "", "led-fault-n", "output-bmc-throttle-n", + /*Z4-Z7*/ "", "", "", "", + /*AA0-AA3*/ "input-cpu1-thermtrip-latch-n", "", "input-cpu1-prochot-n", "", + /*AA4-AC7*/ "", "", "", "", + /*AB0-AB3*/ "", "", "", "", + /*AB4-AC7*/ "", "", "", "", + /*AC0-AC3*/ "", "", "", "", + /*AC4-AC7*/ "", "", "", ""; +}; + +&fmc { + status = "okay"; + flash@0 { + status = "okay"; + label = "bmc"; + m25p,fast-read; + spi-max-frequency = <10000000>; +#include "openbmc-flash-layout-64.dtsi" + }; +}; + +&uart5 { + status = "okay"; +}; + +&vuart { + status = "okay"; +}; + +&mac0 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>; + + nvmem-cells = <ð0_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&mac1 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rmii2_default &pinctrl_mdio2_default>; + use-ncsi; + + nvmem-cells = <ð1_macaddress>; + nvmem-cell-names = "mac-address"; +}; + +&i2c0 { + /* SMBus on auxiliary panel header (AUX_PANEL1) */ + status = "okay"; +}; + +&i2c1 { + status = "okay"; + + w83773g@4c { + compatible = "nuvoton,w83773g"; + reg = <0x4c>; + }; +}; + +&i2c2 { + /* PSU SMBus (PSU_SMB1) */ + status = "okay"; +}; + +&i2c3 { + status = "okay"; +}; + +&i2c4 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + i2c4mux0ch0: i2c@0 { + /* SMBus on PCI express 16x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + + i2c4mux0ch1: i2c@1 { + /* SMBus on PCI express 8x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + + i2c4mux0ch2: i2c@2 { + /* Unknown */ + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + + i2c4mux0ch3: i2c@3 { + /* SMBus on PCI express 1x slot */ + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c7 { + /* FRU and SPD EEPROM SMBus */ + status = "okay"; + + eeprom@57 { + compatible = "st,24c128", "atmel,24c128"; + reg = <0x57>; + pagesize = <16>; + #address-cells = <1>; + #size-cells = <1>; + + eth0_macaddress: macaddress@3f80 { + reg = <0x3f80 6>; + }; + + eth1_macaddress: macaddress@3f88 { + reg = <0x3f88 6>; + }; + }; +}; + +&gfx { + status = "okay"; +}; + +&pinctrl { + aspeed,external-nodes = <&gfx &lhc>; +}; + +&vhub { + status = "okay"; +}; + +&ehci1 { + status = "okay"; +}; + +&uhci { + status = "okay"; +}; + +&kcs3 { + aspeed,lpc-io-reg = <0xca2>; + status = "okay"; +}; + +&lpc_ctrl { + status = "okay"; +}; + +&lpc_snoop { + status = "okay"; + snoop-ports = <0x80>; +}; + +&p2a { + status = "okay"; + memory-region = <&pci_memory>; +}; + +&video { + status = "okay"; + memory-region = <&video_engine_memory>; +}; + +&pwm_tacho { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default + &pinctrl_pwm1_default + &pinctrl_pwm2_default + &pinctrl_pwm3_default + &pinctrl_pwm4_default + &pinctrl_pwm5_default>; + + fan@0 { + /* FAN1 (4-pin) */ + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + fan@1 { + /* FAN2 (4-pin) */ + reg = <0x01>; + aspeed,fan-tach-ch = /bits/ 8 <0x01>; + }; + + fan@2 { + /* FAN3 (4-pin) */ + reg = <0x02>; + aspeed,fan-tach-ch = /bits/ 8 <0x02>; + }; + + fan@3 { + /* FAN4 (6-pin) */ + reg = <0x03>; + aspeed,fan-tach-ch = /bits/ 8 <0x04 0x0b>; + }; + + fan@4 { + /* FAN6 (6-pin) */ + reg = <0x04>; + aspeed,fan-tach-ch = /bits/ 8 <0x06 0x0d>; + }; + + fan@5 { + /* FAN5 (6-pin) */ + reg = <0x05>; + aspeed,fan-tach-ch = /bits/ 8 <0x05 0x0c>; + }; +}; + +&adc { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc0_default + &pinctrl_adc1_default + &pinctrl_adc2_default + &pinctrl_adc3_default + &pinctrl_adc4_default + &pinctrl_adc5_default + &pinctrl_adc6_default + &pinctrl_adc7_default + &pinctrl_adc8_default + &pinctrl_adc9_default + &pinctrl_adc10_default + &pinctrl_adc11_default + &pinctrl_adc12_default + &pinctrl_adc13_default + &pinctrl_adc14_default + &pinctrl_adc15_default>; +}; From 763f0b3f1402cbb9d1ec7a0a37bcc6ebb465b119 Mon Sep 17 00:00:00 2001 From: Peter Yin Date: Tue, 12 Dec 2023 00:26:54 +0800 Subject: [PATCH 012/707] dt-bindings: arm: aspeed: add Meta Harma board Document the new compatibles used on Meta Harma. Acked-by: Krzysztof Kozlowski Signed-off-by: Peter Yin Link: https://lore.kernel.org/r/20231211162656.2564267-2-peteryin.openbmc@gmail.com Signed-off-by: Joel Stanley --- Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml index 85e2c00a238477..7dfcdc2d571eb0 100644 --- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml +++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml @@ -82,6 +82,7 @@ properties: - facebook,elbert-bmc - facebook,fuji-bmc - facebook,greatlakes-bmc + - facebook,harma-bmc - facebook,minerva-cmc - facebook,yosemite4-bmc - ibm,everest-bmc From e17770a3388e05aa59d6a8d4fbcd2a4130222db7 Mon Sep 17 00:00:00 2001 From: Peter Yin Date: Tue, 12 Dec 2023 00:26:55 +0800 Subject: [PATCH 013/707] ARM: dts: aspeed: Harma: Add Meta Harma (AST2600) BMC Add linux device tree entry related to the Meta(Facebook) computer-node system use an AT2600 BMC. This node is named "Harma". Signed-off-by: Peter Yin Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231211162656.2564267-3-peteryin.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 1 + .../dts/aspeed/aspeed-bmc-facebook-harma.dts | 585 ++++++++++++++++++ 2 files changed, 586 insertions(+) create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index fb9cc95f1b60f3..6ecc21d04a6299 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -22,6 +22,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-facebook-fuji.dtb \ aspeed-bmc-facebook-galaxy100.dtb \ aspeed-bmc-facebook-greatlakes.dtb \ + aspeed-bmc-facebook-harma.dtb \ aspeed-bmc-facebook-minerva-cmc.dtb \ aspeed-bmc-facebook-minipack.dtb \ aspeed-bmc-facebook-tiogapass.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts new file mode 100644 index 00000000000000..7db3f9eb00161a --- /dev/null +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright 2023 Facebook Inc. + +/dts-v1/; +#include "aspeed-g6.dtsi" +#include +#include + +/ { + model = "Facebook Harma"; + compatible = "facebook,harma-bmc", "aspeed,ast2600"; + + aliases { + serial0 = &uart1; + serial1 = &uart6; + serial2 = &uart2; + serial4 = &uart5; + + i2c20 = &imux20; + i2c21 = &imux21; + i2c22 = &imux22; + i2c23 = &imux23; + i2c24 = &imux24; + i2c25 = &imux25; + i2c26 = &imux26; + i2c27 = &imux27; + i2c28 = &imux28; + i2c29 = &imux29; + i2c30 = &imux30; + i2c31 = &imux31; + }; + + chosen { + stdout-path = &uart5; + }; + + memory@80000000 { + device_type = "memory"; + reg = <0x80000000 0x80000000>; + }; + + iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc0 0>, <&adc0 1>, <&adc0 2>, <&adc0 3>, + <&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>, + <&adc1 2>; + }; + + leds { + compatible = "gpio-leds"; + + led-0 { + label = "bmc_heartbeat_amber"; + gpios = <&gpio0 ASPEED_GPIO(P, 7) GPIO_ACTIVE_LOW>; + linux,default-trigger = "heartbeat"; + }; + + led-1 { + label = "fp_id_amber"; + default-state = "off"; + gpios = <&gpio0 13 GPIO_ACTIVE_HIGH>; + }; + + led-2 { + label = "power_blue"; + default-state = "off"; + gpios = <&gpio0 124 GPIO_ACTIVE_HIGH>; + }; + }; +}; + +// HOST BIOS Debug +&uart1 { + status = "okay"; +}; + +// SOL Host Console +&uart2 { + status = "okay"; + pinctrl-0 = <>; +}; + +// SOL BMC Console +&uart4 { + status = "okay"; + pinctrl-0 = <>; +}; + +// BMC Debug Console +&uart5 { + status = "okay"; +}; + +// MTIA +&uart6 { + status = "okay"; +}; + +&uart_routing { + status = "okay"; +}; + +&vuart1 { + status = "okay"; +}; + +&wdt1 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_wdtrst1_default>; + aspeed,reset-type = "soc"; + aspeed,external-signal; + aspeed,ext-push-pull; + aspeed,ext-active-high; + aspeed,ext-pulse-duration = <256>; +}; + +&mac3 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rmii4_default>; + use-ncsi; + mlx,multi-host; +}; + +&rtc { + status = "okay"; +}; + +&fmc { + status = "okay"; + + flash@0 { + status = "okay"; + m25p,fast-read; + label = "bmc"; + spi-max-frequency = <50000000>; +#include "openbmc-flash-layout-128.dtsi" + }; + + flash@1 { + status = "okay"; + m25p,fast-read; + label = "alt-bmc"; + spi-max-frequency = <50000000>; + }; +}; + +// BIOS Flash +&spi2 { + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_spi2_default>; + + flash@0 { + status = "okay"; + m25p,fast-read; + label = "pnor"; + spi-max-frequency = <12000000>; + spi-tx-bus-width = <2>; + spi-rx-bus-width = <2>; + }; +}; + +&kcs2 { + status = "okay"; + aspeed,lpc-io-reg = <0xca8>; +}; + +&kcs3 { + status = "okay"; + aspeed,lpc-io-reg = <0xca2>; +}; + +&i2c0 { + status = "okay"; + + max31790@30{ + compatible = "max31790"; + reg = <0x30>; + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +&i2c1 { + status = "okay"; + + tmp75@4b { + compatible = "ti,tmp75"; + reg = <0x4b>; + }; +}; + +&i2c2 { + status = "okay"; + + max31790@30{ + compatible = "max31790"; + reg = <0x30>; + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +&i2c3 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9543"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux20: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + //Retimer Flash + eeprom@50 { + compatible = "atmel,24c2048"; + reg = <0x50>; + pagesize = <128>; + }; + }; + imux21: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c4 { + status = "okay"; + // PDB FRU + eeprom@52 { + compatible = "atmel,24c64"; + reg = <0x52>; + }; + + delta_brick@69 { + compatible = "pmbus"; + reg = <0x69>; + }; +}; + +&i2c5 { + status = "okay"; +}; + +&i2c6 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9543"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux22: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux23: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + }; +}; + +&i2c7 { + status = "okay"; +}; + +&i2c8 { + status = "okay"; +}; + +&i2c9 { + status = "okay"; + + gpio@30 { + compatible = "nxp,pca9555"; + reg = <0x30>; + gpio-controller; + #gpio-cells = <2>; + }; + gpio@31 { + compatible = "nxp,pca9555"; + reg = <0x31>; + gpio-controller; + #gpio-cells = <2>; + }; + + i2c-mux@71 { + compatible = "nxp,pca9546"; + reg = <0x71>; + #address-cells = <1>; + #size-cells = <0>; + + imux24: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux25: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + imux26: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + imux27: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; + // PTTV FRU + eeprom@52 { + compatible = "atmel,24c64"; + reg = <0x52>; + }; +}; + +&i2c11 { + status = "okay"; +}; + +&i2c12 { + status = "okay"; +}; + +&i2c13 { + status = "okay"; + + i2c-mux@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + + imux28: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + imux29: i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + //MB FRU + eeprom@54 { + compatible = "atmel,24c64"; + reg = <0x54>; + }; + }; + imux30: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + imux31: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; + +// To Debug card +&i2c14 { + status = "okay"; + multi-master; + + ipmb@10 { + compatible = "ipmb-dev"; + reg = <(0x10 | I2C_OWN_SLAVE_ADDRESS)>; + i2c-protocol; + }; +}; + +&i2c15 { + status = "okay"; + + // SCM FRU + eeprom@50 { + compatible = "atmel,24c64"; + reg = <0x50>; + }; + + // BSM FRU + eeprom@56 { + compatible = "atmel,24c64"; + reg = <0x56>; + }; +}; + +&adc0 { + aspeed,int-vref-microvolt = <2500000>; + status = "okay"; + pinctrl-0 = <&pinctrl_adc0_default &pinctrl_adc1_default + &pinctrl_adc2_default &pinctrl_adc3_default + &pinctrl_adc4_default &pinctrl_adc5_default + &pinctrl_adc6_default &pinctrl_adc7_default>; +}; + +&adc1 { + aspeed,int-vref-microvolt = <2500000>; + status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_adc10_default>; +}; + +&ehci0 { + status = "okay"; +}; + +&gpio0 { + pinctrl-names = "default"; + gpio-line-names = + /*A0-A7*/ "","","","","","","","", + /*B0-B7*/ "","","","", + "bmc-spi-mux-select-0","led-identify","","", + /*C0-C7*/ "","","","","","","","", + /*D0-D7*/ "","","sol-uart-select","","","","","", + /*E0-E7*/ "","","","","","","","", + /*F0-F7*/ "","","","","","","","", + /*G0-G7*/ "","","","","","","","", + /*H0-H7*/ "","","","","","","","", + /*I0-I7*/ "","","","","","","","", + /*J0-J7*/ "","","","","","","","", + /*K0-K7*/ "","","","","","","","", + /*L0-L7*/ "","","","","","","","", + /*M0-M7*/ "","","","","","","","", + /*N0-N7*/ "led-postcode-0","led-postcode-1", + "led-postcode-2","led-postcode-3", + "led-postcode-4","led-postcode-5", + "led-postcode-6","led-postcode-7", + /*O0-O7*/ "","","","","","","","", + /*P0-P7*/ "power-button","power-host-control", + "reset-button","","led-power","","","", + /*Q0-Q7*/ "","","","","","","","", + /*R0-R7*/ "","","","","","","","", + /*S0-S7*/ "","","","","","","","", + /*T0-T7*/ "","","","","","","","", + /*U0-U7*/ "","","","","","","led-identify-gate","", + /*V0-V7*/ "","","","", + "rtc-battery-voltage-read-enable","","","", + /*W0-W7*/ "","","","","","","","", + /*X0-X7*/ "","","","","","","","", + /*Y0-Y7*/ "","","","","","","","", + /*Z0-Z7*/ "","","","","","","",""; +}; + +&sgpiom0 { + status = "okay"; + max-ngpios = <128>; + ngpios = <128>; + bus-frequency = <2000000>; + gpio-line-names = + /*in - out - in - out */ + /*A0-A3 line 0-7*/ + "presence-scm-cable","power-config-disable-e1s-0", + "","", + "","power-config-disable-e1s-1", + "","", + /*A4-A7 line 8-15*/ + "","power-config-asic-module-enable", + "","power-config-asic-power-good", + "","power-config-pdb-power-good", + "presence-cpu","smi-control-n", + /*B0-B3 line 16-23*/ + "","nmi-control-n", + "","nmi-control-sync-flood-n", + "","", + "","", + /*B4-B7 line 24-31*/ + "","FM_CPU_SP5R1", + "reset-cause-rsmrst","FM_CPU_SP5R2", + "","FM_CPU_SP5R3", + "","FM_CPU_SP5R4", + /*C0-C3 line 32-39*/ + "","FM_CPU0_SA0", + "","FM_CPU0_SA1", + "","rt-cpu0-p0-enable", + "","rt-cpu0-p1-enable", + /*C4-C7 line 40-47*/ + "","smb-rt-rom-p0-select", + "","smb-rt-rom-p1-select", + "","i3c-cpu-mux0-oe-n", + "","i3c-cpu-mux0-select", + /*D0-D3 line 48-55*/ + "","i3c-cpu-mux1-oe-n", + "","i3c-cpu-mux1-select", + "","reset-control-bmc", + "","reset-control-cpu0-p0-mux", + /*D4-D7 line 56-63*/ + "","reset-control-cpu0-p1-mux", + "","reset-control-e1s-mux", + "power-host-good","reset-control-mb-mux", + "","reset-control-smb-e1s", + /*E0-E3 line 64-71*/ + "","reset-control-smb-e1s", + "host-ready-n","reset-control-srst", + "presence-e1s-0","reset-control-usb-hub", + "","reset-control", + /*E4-E7 line 72-79*/ + "presence-e1s-1","reset-control-cpu-kbrst", + "","reset-control-platrst", + "","bmc-jtag-mux-select-0", + "","bmc-jtag-mux-select-1", + /*F0-F3 line 80-87*/ + "","bmc-jtag-select", + "","bmc-ready-n", + "","bmc-ready-sgpio", + "","rt-cpu0-p0-force-enable", + /*F4-F7 line 88-95*/ + "presence-asic-modules-0","rt-cpu0-p1-force-enable", + "presence-asic-modules-1","bios-debug-msg-disable", + "","uart-control-buffer-select", + "","ac-control-n", + /*G0-G3 line 96-103*/ + "FM_CPU_CORETYPE2","", + "FM_CPU_CORETYPE1","", + "FM_CPU_CORETYPE0","", + "FM_BOARD_REV_ID5","", + /*G4-G7 line 104-111*/ + "FM_BOARD_REV_ID4","", + "FM_BOARD_REV_ID3","", + "FM_BOARD_REV_ID2","", + "FM_BOARD_REV_ID1","", + /*H0-H3 line 112-119*/ + "FM_BOARD_REV_ID0","", + "","","","","","", + /*H4-H7 line 120-127*/ + "","", + "reset-control-pcie-expansion-3","", + "reset-control-pcie-expansion-2","", + "reset-control-pcie-expansion-1","", + /*I0-I3 line 128-135*/ + "reset-control-pcie-expansion-0","", + "FM_EXP_SLOT_ID1","", + "FM_EXP_SLOT_ID0","", + "","", + /*I4-I7 line 136-143*/ + "","","","","","","","", + /*J0-J3 line 144-151*/ + "","","","","","","","", + /*J4-J7 line 152-159*/ + "SLOT_ID_BCB_0","", + "SLOT_ID_BCB_1","", + "SLOT_ID_BCB_2","", + "SLOT_ID_BCB_3","", + /*K0-K3 line 160-167*/ + "","","","","","","","", + /*K4-K7 line 168-175*/ + "","","","","","","","", + /*L0-L3 line 176-183*/ + "","","","","","","","", + /*L4-L7 line 184-191*/ + "","","","","","","","", + /*M0-M3 line 192-199*/ + "","","","","","","","", + /*M4-M7 line 200-207*/ + "","","","","","","","", + /*N0-N3 line 208-215*/ + "","","","","","","","", + /*N4-N7 line 216-223*/ + "","","","","","","","", + /*O0-O3 line 224-231*/ + "","","","","","","","", + /*O4-O7 line 232-239*/ + "","","","","","","","", + /*P0-P3 line 240-247*/ + "","","","","","","","", + /*P4-P7 line 248-255*/ + "","","","","","","",""; +}; From 965a8ea59ec55483fb9311036ef928d16770a63a Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:50 +0800 Subject: [PATCH 014/707] ARM: dts: aspeed: minerva: Revise the name of DTS The project Minerva which is the platform used by Meta has two boards: the Chassis Management Module (Minerva) and the Motherboard (Harma), so change the DTS name to minerva here for CMM use. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-2-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/Makefile | 2 +- ...facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename arch/arm/boot/dts/aspeed/{aspeed-bmc-facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} (99%) diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile index 6ecc21d04a6299..75fff585675a0b 100644 --- a/arch/arm/boot/dts/aspeed/Makefile +++ b/arch/arm/boot/dts/aspeed/Makefile @@ -23,7 +23,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-facebook-galaxy100.dtb \ aspeed-bmc-facebook-greatlakes.dtb \ aspeed-bmc-facebook-harma.dtb \ - aspeed-bmc-facebook-minerva-cmc.dtb \ + aspeed-bmc-facebook-minerva.dtb \ aspeed-bmc-facebook-minipack.dtb \ aspeed-bmc-facebook-tiogapass.dtb \ aspeed-bmc-facebook-wedge40.dtb \ diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts similarity index 99% rename from arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts rename to arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f04ef906352080..c755fb3258a485 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -7,7 +7,7 @@ #include / { - model = "Facebook Minerva CMC"; + model = "Facebook Minerva CMM"; compatible = "facebook,minerva-cmc", "aspeed,ast2600"; aliases { From bbdcf72f21fd0e1cc118ad304a867fa863ce2a21 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:51 +0800 Subject: [PATCH 015/707] ARM: dts: aspeed: minerva: Modify mac3 setting Remove the unuse setting and fix the link to 100 M Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-3-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index c755fb3258a485..9979dba1ef0e4a 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -48,10 +48,13 @@ &mac3 { status = "okay"; + phy-mode = "rmii"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_rmii4_default>; - use-ncsi; - mlx,multi-host; + fixed-link { + speed = <100>; + full-duplex; + }; }; &fmc { From f15468aa4cdf3fe6568555d0f3e6df477f8b616d Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:52 +0800 Subject: [PATCH 016/707] ARM: dts: aspeed: minerva: Change sgpio use Correct the sgpio use from sgpiom1 to sgpiom0 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-4-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 9979dba1ef0e4a..ad77057f921cd1 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -78,7 +78,7 @@ status = "okay"; }; -&sgpiom1 { +&sgpiom0 { status = "okay"; ngpios = <128>; bus-frequency = <2000000>; From aca2d2f36bf73b6f129d7626b60ddc1f58e91f6c Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:53 +0800 Subject: [PATCH 017/707] ARM: dts: aspeed: minerva: Enable power monitor device Enable power monitor device ina230 and ltc2945 on the i2c bus 0 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-5-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index ad77057f921cd1..ee9691647e4a50 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -86,6 +86,28 @@ &i2c0 { status = "okay"; + + power-monitor@40 { + compatible = "ti,ina230"; + reg = <0x40>; + shunt-resistor = <1000>; + }; + + power-monitor@41 { + compatible = "ti,ina230"; + reg = <0x41>; + shunt-resistor = <1000>; + }; + + power-monitor@67 { + compatible = "adi,ltc2945"; + reg = <0x67>; + }; + + power-monitor@68 { + compatible = "adi,ltc2945"; + reg = <0x68>; + }; }; &i2c1 { From 10f776c80b1a5d2834430c850457a1d10f12a9ce Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:54 +0800 Subject: [PATCH 018/707] ARM: dts: aspeed: minerva: Add temperature sensor Add one temperature sensor on i2c bus 1 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-6-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index ee9691647e4a50..783d4d5a8f3d7e 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -115,7 +115,12 @@ temperature-sensor@4b { compatible = "ti,tmp75"; - reg = <0x4B>; + reg = <0x4b>; + }; + + temperature-sensor@48 { + compatible = "ti,tmp75"; + reg = <0x48>; }; eeprom@51 { From 96b198848ecd07165f58d93e195ca63244c8dbd4 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:55 +0800 Subject: [PATCH 019/707] ARM: dts: aspeed: minerva: correct the address of eeprom Correct the address from 0x51 to 0x54 of eeprom on the i2c bus 1 Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-7-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 783d4d5a8f3d7e..f2a48033ac5ce7 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -123,9 +123,9 @@ reg = <0x48>; }; - eeprom@51 { + eeprom@54 { compatible = "atmel,24c128"; - reg = <0x51>; + reg = <0x54>; }; }; From 0a40f5979a40e6fe43eaede7d3244dfde86653ee Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:56 +0800 Subject: [PATCH 020/707] ARM: dts: aspeed: minerva: add bus labels and aliases Add bus labels and aliases for the fan control board. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-8-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f2a48033ac5ce7..f4cb5ef72310f9 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -12,6 +12,16 @@ aliases { serial5 = &uart5; + /* + * PCA9548 (2-0077) provides 8 channels connecting to + * 6 pcs of FCB (Fan Controller Board). + */ + i2c16 = &imux16; + i2c17 = &imux17; + i2c18 = &imux18; + i2c19 = &imux19; + i2c20 = &imux20; + i2c21 = &imux21; }; chosen { @@ -139,7 +149,7 @@ #size-cells = <0>; i2c-mux-idle-disconnect; - i2c@0 { + imux16: i2c@0 { #address-cells = <1>; #size-cells = <0>; reg = <0>; @@ -150,7 +160,7 @@ }; }; - i2c@1 { + imux17: i2c@1 { #address-cells = <1>; #size-cells = <0>; reg = <1>; @@ -161,7 +171,7 @@ }; }; - i2c@2 { + imux18: i2c@2 { #address-cells = <1>; #size-cells = <0>; reg = <2>; @@ -172,7 +182,7 @@ }; }; - i2c@3 { + imux19: i2c@3 { #address-cells = <1>; #size-cells = <0>; reg = <3>; @@ -183,7 +193,7 @@ }; }; - i2c@4 { + imux20: i2c@4 { #address-cells = <1>; #size-cells = <0>; reg = <4>; @@ -194,7 +204,7 @@ }; }; - i2c@5 { + imux21: i2c@5 { #address-cells = <1>; #size-cells = <0>; reg = <5>; From 165a1f2db3dd5cf7e7c6b65b0c0750412be75d5d Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:57 +0800 Subject: [PATCH 021/707] ARM: dts: aspeed: minerva: add fan rpm controller Add fan rpm controller max31790 on all bus of FCB. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-9-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index f4cb5ef72310f9..c7445c819baf8e 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -158,6 +158,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux17: i2c@1 { @@ -169,6 +176,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux18: i2c@2 { @@ -180,6 +194,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux19: i2c@3 { @@ -191,6 +212,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux20: i2c@4 { @@ -202,6 +230,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; imux21: i2c@5 { @@ -213,6 +248,13 @@ compatible = "atmel,24c128"; reg = <0x50>; }; + + pwm@5e{ + compatible = "max31790"; + reg = <0x5e>; + #address-cells = <1>; + #size-cells = <0>; + }; }; }; }; From f5dac195b500520257bc608f22937b928c2d0145 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:58 +0800 Subject: [PATCH 022/707] ARM: dts: aspeed: minerva: Add led-fan-fault gpio Add led-fan-fault gpio pin on the PCA9555 on the i2c bus 0. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-10-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../dts/aspeed/aspeed-bmc-facebook-minerva.dts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index c7445c819baf8e..090fe2f6b1d897 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -39,6 +39,16 @@ <&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>, <&adc1 2>; }; + + leds { + compatible = "gpio-leds"; + + led-fan-fault { + label = "led-fan-fault"; + gpios = <&leds_gpio 9 GPIO_ACTIVE_HIGH>; + default-state = "off"; + }; + }; }; &uart6 { @@ -118,6 +128,13 @@ compatible = "adi,ltc2945"; reg = <0x68>; }; + + leds_gpio: gpio@19 { + compatible = "nxp,pca9555"; + reg = <0x19>; + gpio-controller; + #gpio-cells = <2>; + }; }; &i2c1 { From 7a7ed4a02a945c3fa8689c67711ed1aa34aa2415 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:51:59 +0800 Subject: [PATCH 023/707] ARM: dts: aspeed: minerva: add gpio line name Add the GPIO line name that the project's function can use by the meaningful name. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-11-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 090fe2f6b1d897..31197183cc59e5 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -362,3 +362,33 @@ &uhci { status = "okay"; }; + +&gpio0 { + gpio-line-names = + /*A0-A7*/ "","","","","","","","", + /*B0-B7*/ "","","","","","","","", + /*C0-C7*/ "","","","","BLADE_UART_SEL2","","","", + /*D0-D7*/ "","","","","","","","", + /*E0-E7*/ "","","","","","","","", + /*F0-F7*/ "","","","","","","","", + /*G0-G7*/ "","","","","","","","", + /*H0-H7*/ "","","","","","","","", + /*I0-I7*/ "","","","","","","","", + /*J0-J7*/ "","","","","","","","", + /*K0-K7*/ "","","","","","","","", + /*L0-L7*/ "","","","","BLADE_UART_SEL0","","","", + /*M0-M7*/ "","","","","","BLADE_UART_SEL1","","", + /*N0-N7*/ "","","","","","","","", + /*O0-O7*/ "","","","","","","","", + /*P0-P7*/ "","","","","","","","", + /*Q0-Q7*/ "","","","","","","","", + /*R0-R7*/ "","","","","","","","", + /*S0-S7*/ "","","","","","","","", + /*T0-T7*/ "","","","","","","","", + /*U0-U7*/ "","","","","","","","", + /*V0-V7*/ "","","","","BAT_DETECT","","","", + /*W0-W7*/ "","","","","","","","", + /*X0-X7*/ "","","BLADE_UART_SEL3","","","","","", + /*Y0-Y7*/ "","","","","","","","", + /*Z0-Z7*/ "","","","","","","",""; +}; From e60f7a99d3789b5d0b24d3c0571b013309e56815 Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 12 Dec 2023 15:52:00 +0800 Subject: [PATCH 024/707] ARM: dts: aspeed: minerva: add sgpio line name Add the SGPIO line name that the project's function can use by the meaningful name. Signed-off-by: Yang Chen Reviewed-by: Joel Stanley Link: https://lore.kernel.org/r/20231212075200.983536-12-yangchen.openbmc@gmail.com Signed-off-by: Joel Stanley --- .../aspeed/aspeed-bmc-facebook-minerva.dts | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts index 31197183cc59e5..942e53d5c71409 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts @@ -392,3 +392,152 @@ /*Y0-Y7*/ "","","","","","","","", /*Z0-Z7*/ "","","","","","","",""; }; + +&sgpiom0 { + gpio-line-names = + /*"input pin","output pin"*/ + /*A0 - A7*/ + "PRSNT_MTIA_BLADE0_N","PWREN_MTIA_BLADE0_EN", + "PRSNT_MTIA_BLADE1_N","PWREN_MTIA_BLADE1_EN", + "PRSNT_MTIA_BLADE2_N","PWREN_MTIA_BLADE2_EN", + "PRSNT_MTIA_BLADE3_N","PWREN_MTIA_BLADE3_EN", + "PRSNT_MTIA_BLADE4_N","PWREN_MTIA_BLADE4_EN", + "PRSNT_MTIA_BLADE5_N","PWREN_MTIA_BLADE5_EN", + "PRSNT_MTIA_BLADE6_N","PWREN_MTIA_BLADE6_EN", + "PRSNT_MTIA_BLADE7_N","PWREN_MTIA_BLADE7_EN", + /*B0 - B7*/ + "PRSNT_MTIA_BLADE8_N","PWREN_MTIA_BLADE8_EN", + "PRSNT_MTIA_BLADE9_N","PWREN_MTIA_BLADE9_EN", + "PRSNT_MTIA_BLADE10_N","PWREN_MTIA_BLADE10_EN", + "PRSNT_MTIA_BLADE11_N","PWREN_MTIA_BLADE11_EN", + "PRSNT_MTIA_BLADE12_N","PWREN_MTIA_BLADE12_EN", + "PRSNT_MTIA_BLADE13_N","PWREN_MTIA_BLADE13_EN", + "PRSNT_MTIA_BLADE14_N","PWREN_MTIA_BLADE14_EN", + "PRSNT_MTIA_BLADE15_N","PWREN_MTIA_BLADE15_EN", + /*C0 - C7*/ + "PRSNT_NW_BLADE0_N","PWREN_NW_BLADE0_EN", + "PRSNT_NW_BLADE1_N","PWREN_NW_BLADE1_EN", + "PRSNT_NW_BLADE2_N","PWREN_NW_BLADE2_EN", + "PRSNT_NW_BLADE3_N","PWREN_NW_BLADE3_EN", + "PRSNT_NW_BLADE4_N","PWREN_NW_BLADE4_EN", + "PRSNT_NW_BLADE5_N","PWREN_NW_BLADE5_EN", + "PRSNT_FCB_TOP_0_N","PWREN_MTIA_BLADE0_HSC_EN", + "PRSNT_FCB_TOP_1_N","PWREN_MTIA_BLADE1_HSC_EN", + /*D0 - D7*/ + "PRSNT_FCB_MIDDLE_0_N","PWREN_MTIA_BLADE2_HSC_EN", + "PRSNT_FCB_MIDDLE_1_N","PWREN_MTIA_BLADE3_HSC_EN", + "PRSNT_FCB_BOTTOM_0_N","PWREN_MTIA_BLADE4_HSC_EN", + "PRSNT_FCB_BOTTOM_1_N","PWREN_MTIA_BLADE5_HSC_EN", + "PWRGD_MTIA_BLADE0_PWROK_L_BUF","PWREN_MTIA_BLADE6_HSC_EN", + "PWRGD_MTIA_BLADE1_PWROK_L_BUF","PWREN_MTIA_BLADE7_HSC_EN", + "PWRGD_MTIA_BLADE2_PWROK_L_BUF","PWREN_MTIA_BLADE8_HSC_EN", + "PWRGD_MTIA_BLADE3_PWROK_L_BUF","PWREN_MTIA_BLADE9_HSC_EN", + /*E0 - E7*/ + "PWRGD_MTIA_BLADE4_PWROK_L_BUF","PWREN_MTIA_BLADE10_HSC_EN", + "PWRGD_MTIA_BLADE5_PWROK_L_BUF","PWREN_MTIA_BLADE11_HSC_EN", + "PWRGD_MTIA_BLADE6_PWROK_L_BUF","PWREN_MTIA_BLADE12_HSC_EN", + "PWRGD_MTIA_BLADE7_PWROK_L_BUF","PWREN_MTIA_BLADE13_HSC_EN", + "PWRGD_MTIA_BLADE8_PWROK_L_BUF","PWREN_MTIA_BLADE14_HSC_EN", + "PWRGD_MTIA_BLADE9_PWROK_L_BUF","PWREN_MTIA_BLADE15_HSC_EN", + "PWRGD_MTIA_BLADE10_PWROK_L_BUF","PWREN_NW_BLADE0_HSC_EN", + "PWRGD_MTIA_BLADE11_PWROK_L_BUF","PWREN_NW_BLADE1_HSC_EN", + /*F0 - F7*/ + "PWRGD_MTIA_BLADE12_PWROK_L_BUF","PWREN_NW_BLADE2_HSC_EN", + "PWRGD_MTIA_BLADE13_PWROK_L_BUF","PWREN_NW_BLADE3_HSC_EN", + "PWRGD_MTIA_BLADE14_PWROK_L_BUF","PWREN_NW_BLADE4_HSC_EN", + "PWRGD_MTIA_BLADE15_PWROK_L_BUF","PWREN_NW_BLADE5_HSC_EN", + "PWRGD_NW_BLADE0_PWROK_L_BUF","PWREN_FCB_TOP_L_EN", + "PWRGD_NW_BLADE1_PWROK_L_BUF","PWREN_FCB_TOP_R_EN", + "PWRGD_NW_BLADE2_PWROK_L_BUF","PWREN_FCB_MIDDLE_L_EN", + "PWRGD_NW_BLADE3_PWROK_L_BUF","PWREN_FCB_MIDDLE_R_EN", + /*G0 - G7*/ + "PWRGD_NW_BLADE4_PWROK_L_BUF","PWREN_FCB_BOTTOM_L_EN", + "PWRGD_NW_BLADE5_PWROK_L_BUF","PWREN_FCB_BOTTOM_R_EN", + "PWRGD_FCB_TOP_0_PWROK_L_BUF","FM_CMM_AC_CYCLE_N", + "PWRGD_FCB_TOP_1_PWROK_L_BUF","MGMT_SFP_TX_DIS", + "PWRGD_FCB_MIDDLE_0_PWROK_L_BUF","", + "PWRGD_FCB_MIDDLE_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE0_1_N", + "PWRGD_FCB_BOTTOM_0_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE2_3_N", + "PWRGD_FCB_BOTTOM_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE4_5_N", + /*H0 - H7*/ + "LEAK_DETECT_MTIA_BLADE0_N_BUF","RST_I2CRST_MTIA_BLADE6_7_N", + "LEAK_DETECT_MTIA_BLADE1_N_BUF","RST_I2CRST_MTIA_BLADE8_9_N", + "LEAK_DETECT_MTIA_BLADE2_N_BUF","RST_I2CRST_MTIA_BLADE10_11_N", + "LEAK_DETECT_MTIA_BLADE3_N_BUF","RST_I2CRST_MTIA_BLADE12_13_N", + "LEAK_DETECT_MTIA_BLADE4_N_BUF","RST_I2CRST_MTIA_BLADE14_15_N", + "LEAK_DETECT_MTIA_BLADE5_N_BUF","RST_I2CRST_NW_BLADE0_1_2_N", + "LEAK_DETECT_MTIA_BLADE6_N_BUF","RST_I2CRST_NW_BLADE3_4_5_N", + "LEAK_DETECT_MTIA_BLADE7_N_BUF","RST_I2CRST_FCB_N", + /*I0 - I7*/ + "LEAK_DETECT_MTIA_BLADE8_N_BUF","RST_I2CRST_FCB_B_L_N", + "LEAK_DETECT_MTIA_BLADE9_N_BUF","RST_I2CRST_FCB_B_R_N", + "LEAK_DETECT_MTIA_BLADE10_N_BUF","RST_I2CRST_FCB_M_L_N", + "LEAK_DETECT_MTIA_BLADE11_N_BUF","RST_I2CRST_FCB_M_R_N", + "LEAK_DETECT_MTIA_BLADE12_N_BUF","RST_I2CRST_FCB_T_L_N", + "LEAK_DETECT_MTIA_BLADE13_N_BUF","RST_I2CRST_FCB_T_R_N", + "LEAK_DETECT_MTIA_BLADE14_N_BUF","BMC_READY", + "LEAK_DETECT_MTIA_BLADE15_N_BUF","wFM_88E6393X_BIN_UPDATE_EN_N", + /*J0 - J7*/ + "LEAK_DETECT_NW_BLADE0_N_BUF","WATER_VALVE_CLOSED_N", + "LEAK_DETECT_NW_BLADE1_N_BUF","", + "LEAK_DETECT_NW_BLADE2_N_BUF","", + "LEAK_DETECT_NW_BLADE3_N_BUF","", + "LEAK_DETECT_NW_BLADE4_N_BUF","", + "LEAK_DETECT_NW_BLADE5_N_BUF","", + "MTIA_BLADE0_STATUS_LED","", + "MTIA_BLADE1_STATUS_LED","", + /*K0 - K7*/ + "MTIA_BLADE2_STATUS_LED","", + "MTIA_BLADE3_STATUS_LED","", + "MTIA_BLADE4_STATUS_LED","", + "MTIA_BLADE5_STATUS_LED","", + "MTIA_BLADE6_STATUS_LED","", + "MTIA_BLADE7_STATUS_LED","", + "MTIA_BLADE8_STATUS_LED","", + "MTIA_BLADE9_STATUS_LED","", + /*L0 - L7*/ + "MTIA_BLADE10_STATUS_LED","", + "MTIA_BLADE11_STATUS_LED","", + "MTIA_BLADE12_STATUS_LED","", + "MTIA_BLADE13_STATUS_LED","", + "MTIA_BLADE14_STATUS_LED","", + "MTIA_BLADE15_STATUS_LED","", + "NW_BLADE0_STATUS_LED","", + "NW_BLADE1_STATUS_LED","", + /*M0 - M7*/ + "NW_BLADE2_STATUS_LED","", + "NW_BLADE3_STATUS_LED","", + "NW_BLADE4_STATUS_LED","", + "NW_BLADE5_STATUS_LED","", + "RPU_READY","", + "IT_GEAR_RPU_LINK_N","", + "IT_GEAR_LEAK","", + "WATER_VALVE_CLOSED_N","", + /*N0 - N7*/ + "VALVE_STS0","", + "VALVE_STS1","", + "VALVE_STS2","", + "VALVE_STS3","", + "CR_TOGGLE_BOOT_BUF_N","", + "CMM_LC_RDY_LED_N","", + "CMM_LC_UNRDY_LED_N","", + "CMM_CABLE_CARTRIDGE_PRSNT_BOT_N","", + /*O0 - O7*/ + "CMM_CABLE_CARTRIDGE_PRSNT_TOP_N","", + "BOT_BCB_CABLE_PRSNT_N","", + "TOP_BCB_CABLE_PRSNT_N","", + "CHASSIS0_LEAK_Q_N","", + "CHASSIS1_LEAK_Q_N","", + "LEAK0_DETECT","", + "LEAK1_DETECT","", + "MGMT_SFP_PRSNT_N","", + /*P0 - P7*/ + "MGMT_SFP_TX_FAULT","", + "MGMT_SFP_RX_LOS","", + "","", + "","", + "","", + "","", + "","", + "",""; +}; From 4c855a0c7d3100a47640e5c58f39b4e5b4ce30d6 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Sat, 9 Dec 2023 18:15:41 +0100 Subject: [PATCH 025/707] ARM: tegra: Use gpio-fan matrix instead of array on Ouya No functional changes. Adjust to comply with dt-schema requirements and make possible to validate values. Reviewed-by: Linus Walleij Signed-off-by: David Heidelberg Signed-off-by: Thierry Reding --- arch/arm/boot/dts/nvidia/tegra30-ouya.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/nvidia/tegra30-ouya.dts b/arch/arm/boot/dts/nvidia/tegra30-ouya.dts index 7e3de26ca960d2..c284dd0a55ab31 100644 --- a/arch/arm/boot/dts/nvidia/tegra30-ouya.dts +++ b/arch/arm/boot/dts/nvidia/tegra30-ouya.dts @@ -4611,8 +4611,8 @@ fan: fan { compatible = "gpio-fan"; gpios = <&gpio TEGRA_GPIO(J, 2) GPIO_ACTIVE_HIGH>; - gpio-fan,speed-map = <0 0 - 4500 1>; + gpio-fan,speed-map = <0 0>, + <4500 1>; #cooling-cells = <2>; }; From c4ae5addc4c53c504b1bdbd77ce72601ef51da8e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 12 Nov 2023 08:04:14 +0100 Subject: [PATCH 026/707] soc/tegra: pmc: Remove some old and deprecated functions and constants These TEGRA_IO_RAIL_... functions and constants have been deprecated in commit 21b499105178 ("soc/tegra: pmc: Add I/O pad voltage support") in 2016-11. There seems to be no users since kernel 4.16. Remove them now. Signed-off-by: Christophe JAILLET Signed-off-by: Thierry Reding --- drivers/soc/tegra/pmc.c | 24 ------------------------ include/soc/tegra/pmc.h | 18 ------------------ 2 files changed, 42 deletions(-) diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index f432aa022ace06..6dfcc7f50ecea9 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -1777,30 +1777,6 @@ static int tegra_io_pad_get_voltage(struct tegra_pmc *pmc, enum tegra_io_pad id) return TEGRA_IO_PAD_VOLTAGE_3V3; } -/** - * tegra_io_rail_power_on() - enable power to I/O rail - * @id: Tegra I/O pad ID for which to enable power - * - * See also: tegra_io_pad_power_enable() - */ -int tegra_io_rail_power_on(unsigned int id) -{ - return tegra_io_pad_power_enable(id); -} -EXPORT_SYMBOL(tegra_io_rail_power_on); - -/** - * tegra_io_rail_power_off() - disable power to I/O rail - * @id: Tegra I/O pad ID for which to disable power - * - * See also: tegra_io_pad_power_disable() - */ -int tegra_io_rail_power_off(unsigned int id) -{ - return tegra_io_pad_power_disable(id); -} -EXPORT_SYMBOL(tegra_io_rail_power_off); - #ifdef CONFIG_PM_SLEEP enum tegra_suspend_mode tegra_pmc_get_suspend_mode(void) { diff --git a/include/soc/tegra/pmc.h b/include/soc/tegra/pmc.h index aadb845d281dd7..c545875d0ff18e 100644 --- a/include/soc/tegra/pmc.h +++ b/include/soc/tegra/pmc.h @@ -148,10 +148,6 @@ enum tegra_io_pad { TEGRA_IO_PAD_AO_HV, }; -/* deprecated, use TEGRA_IO_PAD_{HDMI,LVDS} instead */ -#define TEGRA_IO_RAIL_HDMI TEGRA_IO_PAD_HDMI -#define TEGRA_IO_RAIL_LVDS TEGRA_IO_PAD_LVDS - #ifdef CONFIG_SOC_TEGRA_PMC int tegra_powergate_power_on(unsigned int id); int tegra_powergate_power_off(unsigned int id); @@ -164,10 +160,6 @@ int tegra_powergate_sequence_power_up(unsigned int id, struct clk *clk, int tegra_io_pad_power_enable(enum tegra_io_pad id); int tegra_io_pad_power_disable(enum tegra_io_pad id); -/* deprecated, use tegra_io_pad_power_{enable,disable}() instead */ -int tegra_io_rail_power_on(unsigned int id); -int tegra_io_rail_power_off(unsigned int id); - void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode); void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode); @@ -211,16 +203,6 @@ static inline int tegra_io_pad_get_voltage(enum tegra_io_pad id) return -ENOSYS; } -static inline int tegra_io_rail_power_on(unsigned int id) -{ - return -ENOSYS; -} - -static inline int tegra_io_rail_power_off(unsigned int id) -{ - return -ENOSYS; -} - static inline void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode) { } From 29939185ad8939050e5d7c85939af9c2255aeea6 Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:15 +0530 Subject: [PATCH 027/707] mm/util: Introduce kmemdup_array() Introduce kmemdup_array() API to duplicate `n` number of elements from a given array. This internally uses kmemdup to allocate and duplicate the `src` array. Signed-off-by: Kartik Acked-by: Kees Cook Signed-off-by: Thierry Reding --- include/linux/string.h | 1 + mm/util.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index ce137830a0b99c..433c207a01daa6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -220,6 +220,7 @@ extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); +extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); diff --git a/mm/util.c b/mm/util.c index aa01f6ea5a75b7..8d6d783520b946 100644 --- a/mm/util.c +++ b/mm/util.c @@ -135,6 +135,23 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * kmemdup_array - duplicate a given array. + * + * @src: array to duplicate. + * @element_size: size of each element of array. + * @count: number of elements to duplicate from array. + * @gfp: GFP mask to use. + * + * Return: duplicated array of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. + */ +void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +{ + return kmemdup(src, size_mul(element_size, count), gfp); +} +EXPORT_SYMBOL(kmemdup_array); + /** * kvmemdup - duplicate region of memory * From c89a004b57ce1dbb0a906c8e8c6fe8f4bfae3ab1 Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:16 +0530 Subject: [PATCH 028/707] soc/tegra: fuse: Use dev_err_probe for probe failures Currently, in tegra_fuse_probe() if clock/reset get fails, then the driver prints an error if the error is not caused by -EPROBE_DEFER. This can be improved by using dev_err_probe() instead. So, return dev_err_probe() if clock/reset get fails. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index a2c28f493a75e5..98805885158e9d 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -131,13 +131,8 @@ static int tegra_fuse_probe(struct platform_device *pdev) fuse->phys = res->start; fuse->clk = devm_clk_get(&pdev->dev, "fuse"); - if (IS_ERR(fuse->clk)) { - if (PTR_ERR(fuse->clk) != -EPROBE_DEFER) - dev_err(&pdev->dev, "failed to get FUSE clock: %ld", - PTR_ERR(fuse->clk)); - - return PTR_ERR(fuse->clk); - } + if (IS_ERR(fuse->clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(fuse->clk), "failed to get FUSE clock\n"); platform_set_drvdata(pdev, fuse); fuse->dev = &pdev->dev; @@ -179,12 +174,8 @@ static int tegra_fuse_probe(struct platform_device *pdev) } fuse->rst = devm_reset_control_get_optional(&pdev->dev, "fuse"); - if (IS_ERR(fuse->rst)) { - err = PTR_ERR(fuse->rst); - dev_err(&pdev->dev, "failed to get FUSE reset: %pe\n", - fuse->rst); - return err; - } + if (IS_ERR(fuse->rst)) + return dev_err_probe(&pdev->dev, PTR_ERR(fuse->rst), "failed to get FUSE reset\n"); /* * FUSE clock is enabled at a boot time, hence this resume/suspend From 650324faffbe3f5342de1df0979258059dd881ed Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:17 +0530 Subject: [PATCH 029/707] soc/tegra: fuse: Refactor resource mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for adding ACPI support to the tegra-apbmisc driver, relocate the code responsible for mapping memory resources from the function ‘tegra_init_apbmisc’ to the function ‘tegra_init_apbmisc_resources.’ This adjustment will allow the code to be shared between ‘tegra_init_apbmisc’ and the upcoming ‘tegra_acpi_init_apbmisc’ function. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/tegra-apbmisc.c | 37 +++++++++++++++----------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index da970f3dbf3562..06c1b3a2c7eccc 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -160,9 +160,28 @@ void __init tegra_init_revision(void) tegra_sku_info.platform = tegra_get_platform(); } -void __init tegra_init_apbmisc(void) +static void tegra_init_apbmisc_resources(struct resource *apbmisc, + struct resource *straps) { void __iomem *strapping_base; + + apbmisc_base = ioremap(apbmisc->start, resource_size(apbmisc)); + if (apbmisc_base) + chipid = readl_relaxed(apbmisc_base + 4); + else + pr_err("failed to map APBMISC registers\n"); + + strapping_base = ioremap(straps->start, resource_size(straps)); + if (strapping_base) { + strapping = readl_relaxed(strapping_base); + iounmap(strapping_base); + } else { + pr_err("failed to map strapping options registers\n"); + } +} + +void __init tegra_init_apbmisc(void) +{ struct resource apbmisc, straps; struct device_node *np; @@ -219,21 +238,7 @@ void __init tegra_init_apbmisc(void) } } - apbmisc_base = ioremap(apbmisc.start, resource_size(&apbmisc)); - if (!apbmisc_base) { - pr_err("failed to map APBMISC registers\n"); - } else { - chipid = readl_relaxed(apbmisc_base + 4); - } - - strapping_base = ioremap(straps.start, resource_size(&straps)); - if (!strapping_base) { - pr_err("failed to map strapping options registers\n"); - } else { - strapping = readl_relaxed(strapping_base); - iounmap(strapping_base); - } - + tegra_init_apbmisc_resources(&apbmisc, &straps); long_ram_code = of_property_read_bool(np, "nvidia,long-ram-code"); put: From ca7b63e8b99c647788375b5e8668424c5096cb50 Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:18 +0530 Subject: [PATCH 030/707] soc/tegra: fuse: Add tegra_acpi_init_apbmisc() In preparation to ACPI support in Tegra fuse driver add function tegra_acpi_init_apbmisc() to initialize tegra-apbmisc driver. Also, document the reason of calling tegra_init_apbmisc() at early init. Note that function tegra_acpi_init_apbmisc() is not placed in the __init section, because it will be called during probe. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse.h | 1 + drivers/soc/tegra/fuse/tegra-apbmisc.c | 72 ++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h index 90f23be738947a..a41e9f85281aa8 100644 --- a/drivers/soc/tegra/fuse/fuse.h +++ b/drivers/soc/tegra/fuse/fuse.h @@ -69,6 +69,7 @@ struct tegra_fuse { void tegra_init_revision(void); void tegra_init_apbmisc(void); +void tegra_acpi_init_apbmisc(void); u32 __init tegra_fuse_read_spare(unsigned int spare); u32 __init tegra_fuse_read_early(unsigned int offset); diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index 06c1b3a2c7eccc..6457f80821bb95 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -3,9 +3,11 @@ * Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved. */ +#include #include #include #include +#include #include #include @@ -180,6 +182,12 @@ static void tegra_init_apbmisc_resources(struct resource *apbmisc, } } +/** + * tegra_init_apbmisc - Initializes Tegra APBMISC and Strapping registers. + * + * This is called during early init as some of the old 32-bit ARM code needs + * information from the APBMISC registers very early during boot. + */ void __init tegra_init_apbmisc(void) { struct resource apbmisc, straps; @@ -244,3 +252,67 @@ void __init tegra_init_apbmisc(void) put: of_node_put(np); } + +#ifdef CONFIG_ACPI +static const struct acpi_device_id apbmisc_acpi_match[] = { + { "NVDA2010" }, + { /* sentinel */ } +}; + +void tegra_acpi_init_apbmisc(void) +{ + struct resource *resources[2] = { NULL }; + struct resource_entry *rentry; + struct acpi_device *adev = NULL; + struct list_head resource_list; + int rcount = 0; + int ret; + + adev = acpi_dev_get_first_match_dev(apbmisc_acpi_match[0].id, NULL, -1); + if (!adev) + return; + + INIT_LIST_HEAD(&resource_list); + + ret = acpi_dev_get_memory_resources(adev, &resource_list); + if (ret < 0) { + pr_err("failed to get APBMISC memory resources"); + goto out_put_acpi_dev; + } + + /* + * Get required memory resources. + * + * resources[0]: apbmisc. + * resources[1]: straps. + */ + resource_list_for_each_entry(rentry, &resource_list) { + if (rcount >= ARRAY_SIZE(resources)) + break; + + resources[rcount++] = rentry->res; + } + + if (!resources[0]) { + pr_err("failed to get APBMISC registers\n"); + goto out_free_resource_list; + } + + if (!resources[1]) { + pr_err("failed to get strapping options registers\n"); + goto out_free_resource_list; + } + + tegra_init_apbmisc_resources(resources[0], resources[1]); + +out_free_resource_list: + acpi_dev_free_resource_list(&resource_list); + +out_put_acpi_dev: + acpi_dev_put(adev); +} +#else +void tegra_acpi_init_apbmisc(void) +{ +} +#endif From 8a2ac683cbf4d11a12f6bb67b2afc9109078568a Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:19 +0530 Subject: [PATCH 031/707] soc/tegra: fuse: Add function to add lookups Add helper function tegra_fuse_add_lookups() to register Tegra fuse nvmem lookups. So, this can be shared between tegra_fuse_init() and ACPI probe, which is to be introduced later. Use kmemdup_array to duplicate fuse->soc->lookups. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 98805885158e9d..4ebb5597a77b7e 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -113,6 +113,18 @@ static void tegra_fuse_restore(void *base) fuse->clk = NULL; } +static int tegra_fuse_add_lookups(struct tegra_fuse *fuse) +{ + fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), + fuse->soc->num_lookups, GFP_KERNEL); + if (!fuse->lookups) + return -ENOMEM; + + nvmem_add_cell_lookups(fuse->lookups, fuse->soc->num_lookups); + + return 0; +} + static int tegra_fuse_probe(struct platform_device *pdev) { void __iomem *base = fuse->base; @@ -398,6 +410,7 @@ static int __init tegra_init_fuse(void) const struct of_device_id *match; struct device_node *np; struct resource regs; + int err; tegra_init_apbmisc(); @@ -495,15 +508,11 @@ static int __init tegra_init_fuse(void) pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n", tegra_sku_info.cpu_speedo_id, tegra_sku_info.soc_speedo_id); - if (fuse->soc->lookups) { - size_t size = sizeof(*fuse->lookups) * fuse->soc->num_lookups; - - fuse->lookups = kmemdup(fuse->soc->lookups, size, GFP_KERNEL); - if (fuse->lookups) - nvmem_add_cell_lookups(fuse->lookups, fuse->soc->num_lookups); - } + err = tegra_fuse_add_lookups(fuse); + if (err) + pr_err("failed to add FUSE lookups\n"); - return 0; + return err; } early_initcall(tegra_init_fuse); From bf66ea5d1fe29acc681a3422f5d1967b7486acac Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:20 +0530 Subject: [PATCH 032/707] soc/tegra: fuse: Add function to print SKU info Add helper function tegra_fuse_print_sku_info() to print Tegra SKU information. So, it can be shared between tegra_fuse_init() and ACPI probe which is to be introduced later. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 4ebb5597a77b7e..7a93c6512f7b53 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -113,6 +113,16 @@ static void tegra_fuse_restore(void *base) fuse->clk = NULL; } +static void tegra_fuse_print_sku_info(struct tegra_sku_info *tegra_sku_info) +{ + pr_info("Tegra Revision: %s SKU: %d CPU Process: %d SoC Process: %d\n", + tegra_revision_name[tegra_sku_info->revision], + tegra_sku_info->sku_id, tegra_sku_info->cpu_process_id, + tegra_sku_info->soc_process_id); + pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n", + tegra_sku_info->cpu_speedo_id, tegra_sku_info->soc_speedo_id); +} + static int tegra_fuse_add_lookups(struct tegra_fuse *fuse) { fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), @@ -501,12 +511,7 @@ static int __init tegra_init_fuse(void) fuse->soc->init(fuse); - pr_info("Tegra Revision: %s SKU: %d CPU Process: %d SoC Process: %d\n", - tegra_revision_name[tegra_sku_info.revision], - tegra_sku_info.sku_id, tegra_sku_info.cpu_process_id, - tegra_sku_info.soc_process_id); - pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n", - tegra_sku_info.cpu_speedo_id, tegra_sku_info.soc_speedo_id); + tegra_fuse_print_sku_info(&tegra_sku_info); err = tegra_fuse_add_lookups(fuse); if (err) From c5b2d43e67bb8de33e0e8216d7703fdcd62227bf Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:21 +0530 Subject: [PATCH 033/707] soc/tegra: fuse: Add ACPI support for Tegra194 and Tegra234 Add ACPI support for Tegra194 & Tegra243 SoC's. This requires following modifications to the probe when ACPI boot is used: - Initialize soc data. - Add nvmem lookups. - Register soc device. - use devm_clk_get_optional() instead of devm_clk_get() to get fuse->clk, as fuse clocks are not required when using ACPI boot. Also, drop '__init' keyword for tegra_soc_device_register() as this is also used by tegra_fuse_probe() and use dev_err_probe() wherever applicable. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 52 +++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 7a93c6512f7b53..1c758f121f916c 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -3,11 +3,13 @@ * Copyright (c) 2013-2023, NVIDIA CORPORATION. All rights reserved. */ +#include #include #include #include #include #include +#include #include #include #include @@ -152,7 +154,38 @@ static int tegra_fuse_probe(struct platform_device *pdev) return PTR_ERR(fuse->base); fuse->phys = res->start; - fuse->clk = devm_clk_get(&pdev->dev, "fuse"); + /* Initialize the soc data and lookups if using ACPI boot. */ + if (is_acpi_node(dev_fwnode(&pdev->dev)) && !fuse->soc) { + u8 chip; + + tegra_acpi_init_apbmisc(); + + chip = tegra_get_chip_id(); + switch (chip) { +#if defined(CONFIG_ARCH_TEGRA_194_SOC) + case TEGRA194: + fuse->soc = &tegra194_fuse_soc; + break; +#endif +#if defined(CONFIG_ARCH_TEGRA_234_SOC) + case TEGRA234: + fuse->soc = &tegra234_fuse_soc; + break; +#endif + default: + return dev_err_probe(&pdev->dev, -EINVAL, "Unsupported SoC: %02x\n", chip); + } + + fuse->soc->init(fuse); + tegra_fuse_print_sku_info(&tegra_sku_info); + tegra_soc_device_register(); + + err = tegra_fuse_add_lookups(fuse); + if (err) + return dev_err_probe(&pdev->dev, err, "failed to add FUSE lookups\n"); + } + + fuse->clk = devm_clk_get_optional(&pdev->dev, "fuse"); if (IS_ERR(fuse->clk)) return dev_err_probe(&pdev->dev, PTR_ERR(fuse->clk), "failed to get FUSE clock\n"); @@ -275,10 +308,17 @@ static const struct dev_pm_ops tegra_fuse_pm = { SET_SYSTEM_SLEEP_PM_OPS(tegra_fuse_suspend, tegra_fuse_resume) }; +static const struct acpi_device_id tegra_fuse_acpi_match[] = { + { "NVDA200F" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(acpi, tegra_fuse_acpi_match); + static struct platform_driver tegra_fuse_driver = { .driver = { .name = "tegra-fuse", .of_match_table = tegra_fuse_match, + .acpi_match_table = tegra_fuse_acpi_match, .pm = &tegra_fuse_pm, .suppress_bind_attrs = true, }, @@ -300,7 +340,13 @@ u32 __init tegra_fuse_read_early(unsigned int offset) int tegra_fuse_readl(unsigned long offset, u32 *value) { - if (!fuse->read || !fuse->clk) + /* + * Wait for fuse->clk to be initialized if device-tree boot is used. + */ + if (is_of_node(dev_fwnode(fuse->dev)) && !fuse->clk) + return -EPROBE_DEFER; + + if (!fuse->read) return -EPROBE_DEFER; if (IS_ERR(fuse->clk)) @@ -383,7 +429,7 @@ const struct attribute_group tegra194_soc_attr_group = { }; #endif -struct device * __init tegra_soc_device_register(void) +struct device *tegra_soc_device_register(void) { struct soc_device_attribute *attr; struct soc_device *dev; From dee509eb9cd593b7bcb1c1f1f5f2d7e75e389290 Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 10:53:22 +0530 Subject: [PATCH 034/707] soc/tegra: fuse: Add support for Tegra241 Add support for Tegra241 which use ACPI boot. Signed-off-by: Kartik Signed-off-by: Thierry Reding --- drivers/soc/tegra/Kconfig | 5 +++++ drivers/soc/tegra/fuse/fuse-tegra.c | 5 +++++ drivers/soc/tegra/fuse/fuse-tegra30.c | 20 ++++++++++++++++++++ drivers/soc/tegra/fuse/fuse.h | 4 ++++ drivers/soc/tegra/fuse/tegra-apbmisc.c | 1 + include/soc/tegra/fuse.h | 1 + 6 files changed, 36 insertions(+) diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index f16beeabaa92bb..33512558af9f7f 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -133,6 +133,11 @@ config ARCH_TEGRA_234_SOC help Enable support for the NVIDIA Tegra234 SoC. +config ARCH_TEGRA_241_SOC + bool "NVIDIA Tegra241 SoC" + help + Enable support for the NVIDIA Tegra241 SoC. + endif endif diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 1c758f121f916c..233b8e7bb41bf9 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -171,6 +171,11 @@ static int tegra_fuse_probe(struct platform_device *pdev) case TEGRA234: fuse->soc = &tegra234_fuse_soc; break; +#endif +#if defined(CONFIG_ARCH_TEGRA_241_SOC) + case TEGRA241: + fuse->soc = &tegra241_fuse_soc; + break; #endif default: return dev_err_probe(&pdev->dev, -EINVAL, "Unsupported SoC: %02x\n", chip); diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c index e94d46372a6396..2070d36c510dcb 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra30.c +++ b/drivers/soc/tegra/fuse/fuse-tegra30.c @@ -678,3 +678,23 @@ const struct tegra_fuse_soc tegra234_fuse_soc = { .clk_suspend_on = false, }; #endif + +#if defined(CONFIG_ARCH_TEGRA_241_SOC) +static const struct tegra_fuse_info tegra241_fuse_info = { + .read = tegra30_fuse_read, + .size = 0x16008, + .spare = 0xcf0, +}; + +static const struct nvmem_keepout tegra241_fuse_keepouts[] = { + { .start = 0xc, .end = 0x1600c } +}; + +const struct tegra_fuse_soc tegra241_fuse_soc = { + .init = tegra30_fuse_init, + .info = &tegra241_fuse_info, + .keepouts = tegra241_fuse_keepouts, + .num_keepouts = ARRAY_SIZE(tegra241_fuse_keepouts), + .soc_attr_group = &tegra194_soc_attr_group, +}; +#endif diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h index a41e9f85281aa8..f3b705327c20f8 100644 --- a/drivers/soc/tegra/fuse/fuse.h +++ b/drivers/soc/tegra/fuse/fuse.h @@ -136,4 +136,8 @@ extern const struct tegra_fuse_soc tegra194_fuse_soc; extern const struct tegra_fuse_soc tegra234_fuse_soc; #endif +#ifdef CONFIG_ARCH_TEGRA_241_SOC +extern const struct tegra_fuse_soc tegra241_fuse_soc; +#endif + #endif diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index 6457f80821bb95..e2ca5d55fd3125 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -64,6 +64,7 @@ bool tegra_is_silicon(void) switch (tegra_get_chip_id()) { case TEGRA194: case TEGRA234: + case TEGRA241: case TEGRA264: if (tegra_get_platform() == 0) return true; diff --git a/include/soc/tegra/fuse.h b/include/soc/tegra/fuse.h index 3a513be502437f..8f421b9f7585ca 100644 --- a/include/soc/tegra/fuse.h +++ b/include/soc/tegra/fuse.h @@ -17,6 +17,7 @@ #define TEGRA186 0x18 #define TEGRA194 0x19 #define TEGRA234 0x23 +#define TEGRA241 0x24 #define TEGRA264 0x26 #define TEGRA_FUSE_SKU_CALIB_0 0xf0 From 58d4cb6b73be15a9119b24d5d8f581eaa0cc9ff0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Dec 2023 10:01:44 -0500 Subject: [PATCH 035/707] btrfs: do not allow non subvolume root targets for snapshot Our btrfs subvolume snapshot utility enforces that is the root of the subvolume, however this isn't enforced in the kernel. Update the kernel to also enforce this limitation to avoid problems with other users of this ioctl that don't have the appropriate checks in place. Reported-by: Martin Michaelis CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Neal Gompa Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e50b62db2a8fe..a1743904202b78 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, * are limited to own subvolumes only */ ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. + */ + ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, From f7b487648986ecc0510996c5c638c61f5f811ccc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:15 -0800 Subject: [PATCH 036/707] lib/find: add atomic find_bit() primitives Add helpers around test_and_{set,clear}_bit() that allow to search for clear or set bits and flip them atomically. The target patterns may look like this: for (idx = 0; idx < nbits; idx++) if (test_and_clear_bit(idx, bitmap)) do_something(idx); Or like this: do { bit = find_first_bit(bitmap, nbits); if (bit >= nbits) return nbits; } while (!test_and_clear_bit(bit, bitmap)); return bit; In both cases, the opencoded loop may be converted to a single function or iterator call. Correspondingly: for_each_test_and_clear_bit(idx, bitmap, nbits) do_something(idx); Or: return find_and_clear_bit(bitmap, nbits); Obviously, the less routine code people have to write themself, the less probability to make a mistake. Those are not only handy helpers but also resolve a non-trivial issue of using non-atomic find_bit() together with atomic test_and_{set,clear)_bit(). The trick is that find_bit() implies that the bitmap is a regular non-volatile piece of memory, and compiler is allowed to use such optimization techniques like re-fetching memory instead of caching it. For example, find_first_bit() is implemented like this: for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { val = addr[idx]; if (val) { sz = min(idx * BITS_PER_LONG + __ffs(val), sz); break; } } On register-memory architectures, like x86, compiler may decide to access memory twice - first time to compare against 0, and second time to fetch its value to pass it to __ffs(). When running find_first_bit() on volatile memory, the memory may get changed in-between, and for instance, it may lead to passing 0 to __ffs(), which is undefined. This is a potentially dangerous call. find_and_clear_bit() as a wrapper around test_and_clear_bit() naturally treats underlying bitmap as a volatile memory and prevents compiler from such optimizations. Now that KCSAN is catching exactly this type of situations and warns on undercover memory modifications. We can use it to reveal improper usage of find_bit(), and convert it to atomic find_and_*_bit() as appropriate. In some cases concurrent operations with plain find_bit() are acceptable. For example: - two threads running find_*_bit(): safe wrt ffs(0) and returns correct value, because underlying bitmap is unchanged; - find_next_bit() in parallel with set or clear_bit(), when modifying a bit prior to the start bit to search: safe and correct; - find_first_bit() in parallel with set_bit(): safe, but may return wrong bit number; - find_first_zero_bit() in parallel with clear_bit(): same as above. In last 2 cases find_bit() may not return a correct bit number, but it may be OK if caller requires any (not exactly the first) set or clear bit, correspondingly. In such cases, KCSAN may be safely silenced with data_race(). But in most cases where KCSAN detects concurrency people should carefully review their code and likely protect critical sections or switch to atomic find_and_bit(), as appropriate. The 1st patch of the series adds the following atomic primitives: find_and_set_bit(addr, nbits); find_and_set_next_bit(addr, nbits, start); ... Here find_and_{set,clear} part refers to the corresponding test_and_{set,clear}_bit function. Suffixes like _wrap or _lock derive their semantics from corresponding find() or test() functions. For brevity, the naming omits the fact that we search for zero bit in find_and_set, and correspondingly search for set bit in find_and_clear functions. The patch also adds iterators with atomic semantics, like for_each_test_and_set_bit(). Here, the naming rule is to simply prefix corresponding atomic operation with 'for_each'. CC: Bart Van Assche CC: Sergey Shtylyov Signed-off-by: Yury Norov --- include/linux/find.h | 293 +++++++++++++++++++++++++++++++++++++++++++ lib/find_bit.c | 85 +++++++++++++ 2 files changed, 378 insertions(+) diff --git a/include/linux/find.h b/include/linux/find.h index c69598e383c161..50eeeed5d8a34b 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long *addr1, extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); +unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); +unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); +unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits); +unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, unsigned long nbits, + unsigned long start); + #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned @@ -460,6 +470,267 @@ unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, return bit < start ? bit : size; } +/** + * find_and_set_bit - Find a zero bit and set it atomically + * @addr: The address to base the search on + * @nbits: The bitmap size in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_bit(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, 0); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit(ret, addr)); + + return ret; + } + + return _find_and_set_bit(addr, nbits); +} + + +/** + * find_and_set_next_bit - Find a zero bit and set it, starting from @offset + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * @offset: The bitnumber to start searching at + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap, starting from @offset. + * It's also not guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, offset); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit(ret, addr)); + + return ret; + } + + return _find_and_set_next_bit(addr, nbits, offset); +} + +/** + * find_and_set_bit_wrap - find and set bit starting at @offset, wrapping around zero + * @addr: The first address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns: the bit number for the next clear bit, or first clear bit up to @offset, + * while atomically setting it. If no bits are found, returns @nbits. + */ +static inline +unsigned long find_and_set_bit_wrap(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + unsigned long bit = find_and_set_next_bit(addr, nbits, offset); + + if (bit < nbits || offset == 0) + return bit; + + bit = find_and_set_bit(addr, offset); + return bit < offset ? bit : nbits; +} + +/** + * find_and_set_bit_lock - find a zero bit, then set it atomically with lock + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, 0); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit_lock(ret, addr)); + + return ret; + } + + return _find_and_set_bit_lock(addr, nbits); +} + +/** + * find_and_set_next_bit_lock - find a zero bit and set it atomically with lock + * @addr: The address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the range. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and set bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_set_next_bit_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr | ~GENMASK(nbits - 1, offset); + if (val == ~0UL) + return nbits; + ret = ffz(val); + } while (test_and_set_bit_lock(ret, addr)); + + return ret; + } + + return _find_and_set_next_bit_lock(addr, nbits, offset); +} + +/** + * find_and_set_bit_wrap_lock - find zero bit starting at @ofset and set it + * with lock, and wrap around zero if nothing found + * @addr: The first address to base the search on + * @nbits: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Returns: the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @nbits. + */ +static inline +unsigned long find_and_set_bit_wrap_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + unsigned long bit = find_and_set_next_bit_lock(addr, nbits, offset); + + if (bit < nbits || offset == 0) + return bit; + + bit = find_and_set_bit_lock(addr, offset); + return bit < offset ? bit : nbits; +} + +/** + * find_and_clear_bit - Find a set bit and clear it atomically + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the bitmap. It's also not + * guaranteed that if @nbits is returned, the bitmap is empty. + * + * The function does guarantee that if returned value is in range [0 .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and cleared bit, or @nbits if no bits found + */ +static inline unsigned long find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr & GENMASK(nbits - 1, 0); + if (val == 0) + return nbits; + ret = __ffs(val); + } while (!test_and_clear_bit(ret, addr)); + + return ret; + } + + return _find_and_clear_bit(addr, nbits); +} + +/** + * find_and_clear_next_bit - Find a set bit next after @offset, and clear it atomically + * @addr: The address to base the search on + * @nbits: The bitmap nbits in bits + * @offset: bit offset at which to start searching + * + * This function is designed to operate in concurrent access environment. + * + * Because of concurrency and volatile nature of underlying bitmap, it's not + * guaranteed that the found bit is the 1st bit in the range It's also not + * guaranteed that if @nbits is returned, there's no set bits after @offset. + * + * The function does guarantee that if returned value is in range [@offset .. @nbits), + * the acquired bit belongs to the caller exclusively. + * + * Returns: found and cleared bit, or @nbits if no bits found + */ +static inline +unsigned long find_and_clear_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long offset) +{ + if (small_const_nbits(nbits)) { + unsigned long val, ret; + + do { + val = *addr & GENMASK(nbits - 1, offset); + if (val == 0) + return nbits; + ret = __ffs(val); + } while (!test_and_clear_bit(ret, addr)); + + return ret; + } + + return _find_and_clear_next_bit(addr, nbits, offset); +} + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump @@ -577,6 +848,28 @@ unsigned long find_next_bit_le(const void *addr, unsigned #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) +/* same as for_each_set_bit() but atomically clears each found bit */ +#define for_each_test_and_clear_bit(bit, addr, size) \ + for ((bit) = 0; \ + (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + +/* same as for_each_set_bit_from() but atomically clears each found bit */ +#define for_each_test_and_clear_bit_from(bit, addr, size) \ + for (; (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) + +/* same as for_each_clear_bit() but atomically sets each found bit */ +#define for_each_test_and_set_bit(bit, addr, size) \ + for ((bit) = 0; \ + (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + +/* same as for_each_clear_bit_from() but atomically clears each found bit */ +#define for_each_test_and_set_bit_from(bit, addr, size) \ + for (; \ + (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \ + (bit)++) + #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ diff --git a/lib/find_bit.c b/lib/find_bit.c index 32f99e9a670e64..c9b6b9f966108f 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -116,6 +116,91 @@ unsigned long _find_first_and_bit(const unsigned long *addr1, EXPORT_SYMBOL(_find_first_and_bit); #endif +unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_bit); + +unsigned long _find_and_set_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + unsigned long bit; + + do { + bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_next_bit); + +unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit_lock(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_bit_lock); + +unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + unsigned long bit; + + do { + bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); + if (bit >= nbits) + return nbits; + } while (test_and_set_bit_lock(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_set_next_bit_lock); + +unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits) +{ + unsigned long bit; + + do { + bit = FIND_FIRST_BIT(addr[idx], /* nop */, nbits); + if (bit >= nbits) + return nbits; + } while (!test_and_clear_bit(bit, addr)); + + return bit; +} +EXPORT_SYMBOL(_find_and_clear_bit); + +unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, + unsigned long nbits, unsigned long start) +{ + do { + start = FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); + if (start >= nbits) + return nbits; + } while (!test_and_clear_bit(start, addr)); + + return start; +} +EXPORT_SYMBOL(_find_and_clear_next_bit); + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. From 9297e20670743257f6c3fe7ebd6be5802b1dc8c7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:16 -0800 Subject: [PATCH 037/707] lib/find: add test for atomic find_bit() ops Add basic functionality test for new API. Signed-off-by: Yury Norov --- lib/test_bitmap.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 65f22c2578b066..277e1ca9fd2847 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -221,6 +221,65 @@ static void __init test_zero_clear(void) expect_eq_pbl("", bmap, 1024); } +static void __init test_find_and_bit(void) +{ + unsigned long w, w_part, bit, cnt = 0; + DECLARE_BITMAP(bmap, EXP1_IN_BITS); + + /* + * Test find_and_clear{_next}_bit() and corresponding + * iterators + */ + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + + for_each_test_and_clear_bit(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(w, cnt); + expect_eq_uint(0, bitmap_weight(bmap, EXP1_IN_BITS)); + + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3); + + cnt = 0; + bit = EXP1_IN_BITS / 3; + for_each_test_and_clear_bit_from(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(bitmap_weight(bmap, EXP1_IN_BITS), bitmap_weight(bmap, EXP1_IN_BITS / 3)); + expect_eq_uint(w_part, bitmap_weight(bmap, EXP1_IN_BITS)); + expect_eq_uint(w - w_part, cnt); + + /* + * Test find_and_set{_next}_bit() and corresponding + * iterators + */ + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + cnt = 0; + + for_each_test_and_set_bit(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(EXP1_IN_BITS - w, cnt); + expect_eq_uint(EXP1_IN_BITS, bitmap_weight(bmap, EXP1_IN_BITS)); + + bitmap_copy(bmap, exp1, EXP1_IN_BITS); + w = bitmap_weight(bmap, EXP1_IN_BITS); + w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3); + cnt = 0; + + bit = EXP1_IN_BITS / 3; + for_each_test_and_set_bit_from(bit, bmap, EXP1_IN_BITS) + cnt++; + + expect_eq_uint(EXP1_IN_BITS - bitmap_weight(bmap, EXP1_IN_BITS), + EXP1_IN_BITS / 3 - bitmap_weight(bmap, EXP1_IN_BITS / 3)); + expect_eq_uint(EXP1_IN_BITS * 2 / 3 - (w - w_part), cnt); +} + static void __init test_find_nth_bit(void) { unsigned long b, bit, cnt = 0; @@ -1273,6 +1332,8 @@ static void __init selftest(void) test_for_each_clear_bitrange_from(); test_for_each_set_clump8(); test_for_each_set_bit_wrap(); + + test_find_and_bit(); } KSTM_MODULE_LOADERS(test_bitmap); From 0af7b0df61f906c57568b8730df70058820a7613 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:17 -0800 Subject: [PATCH 038/707] lib/sbitmap; optimize __sbitmap_get_word() by using find_and_set_bit() __sbitmap_get_word() opencodes either find_and_set_bit_wrap(), or find_and_set_next_bit() depending on wrap parameter. Simplify it by using atomic find_bit() API. While here, simplify sbitmap_find_bit_in_word(), which calls it. CC: Jens Axboe Signed-off-by: Yury Norov Reviewed-by: Jan Kara --- lib/sbitmap.c | 46 +++++++++------------------------------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index d0a5081dfd122e..8ecd830ba9e896 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -133,38 +133,13 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_resize); -static int __sbitmap_get_word(unsigned long *word, unsigned long depth, +static inline int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { - int nr; - - /* don't wrap if starting from 0 */ - wrap = wrap && hint; - - while (1) { - nr = find_next_zero_bit(word, depth, hint); - if (unlikely(nr >= depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (hint && wrap) { - hint = 0; - continue; - } - return -1; - } + if (wrap) + return find_and_set_bit_wrap_lock(word, depth, hint); - if (!test_and_set_bit_lock(nr, word)) - break; - - hint = nr + 1; - if (hint >= depth - 1) - hint = 0; - } - - return nr; + return find_and_set_next_bit_lock(word, depth, hint); } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, @@ -175,15 +150,12 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map, int nr; do { - nr = __sbitmap_get_word(&map->word, depth, - alloc_hint, wrap); - if (nr != -1) - break; - if (!sbitmap_deferred_clear(map)) - break; - } while (1); + nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); + if (nr < depth) + return nr; + } while (sbitmap_deferred_clear(map)); - return nr; + return -1; } static int sbitmap_find_bit(struct sbitmap *sb, From fc3bdc592a724edeb6546f39f168067571007d7a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:18 -0800 Subject: [PATCH 039/707] watch_queue: optimize post_one_notification() by using find_and_clear_bit() post_one_notification() searches for a set bit in wqueue->notes_bitmap, and after some housekeeping work clears it, firing a BUG() if someone else cleared the bit in-between. We can allocate the bit atomically with an atomic find_and_clear_bit(), and remove the BUG() possibility entirely. Signed-off-by: Yury Norov --- kernel/watch_queue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 778b4056700ff5..07edd4a2b4636b 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -112,7 +112,7 @@ static bool post_one_notification(struct watch_queue *wqueue, if (pipe_full(head, tail, pipe->ring_size)) goto lost; - note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + note = find_and_clear_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; @@ -133,10 +133,6 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ - if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { - spin_unlock_irq(&pipe->rd_wait.lock); - BUG(); - } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; From cd6c08c6647d524d71f51428b78fe72590d42c16 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:19 -0800 Subject: [PATCH 040/707] sched: add cpumask_find_and_set() and use it in __mm_cid_get() __mm_cid_get() uses __mm_cid_try_get() helper to atomically acquire a bit in mm cid mask. Now that we have atomic find_and_set_bit(), we can easily extend it to cpumasks and use in the scheduler code. cpumask_find_and_set() considers cid mask as a volatile region of memory, as it actually is in this case. So, if it's changed while search is in progress, KCSAN wouldn't fire warning on it. CC: Peter Zijlstra Signed-off-by: Yury Norov Reviewed-by: Mathieu Desnoyers --- include/linux/cpumask.h | 12 ++++++++++++ kernel/sched/sched.h | 14 +++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index cfb545841a2c74..c2acced8be4ece 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -271,6 +271,18 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, small_cpumask_bits, n + 1); } +/** + * cpumask_find_and_set - find the first unset cpu in a cpumask and + * set it atomically + * @srcp: the cpumask pointer + * + * Return: >= nr_cpu_ids if nothing is found. + */ +static inline unsigned int cpumask_find_and_set(volatile struct cpumask *srcp) +{ + return find_and_set_bit(cpumask_bits(srcp), small_cpumask_bits); +} + /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a4222..2ce9112de89be1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3347,23 +3347,19 @@ static inline void mm_cid_put(struct mm_struct *mm) static inline int __mm_cid_try_get(struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cpumask = mm_cidmask(mm); + int cid = nr_cpu_ids; - cpumask = mm_cidmask(mm); /* * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ - for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) - break; + while (cid >= nr_cpu_ids) { + cid = cpumask_find_and_set(cpumask); cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) - return -1; + return cid; } From 991411e2febc2f472d7ace069bb371d1bf2f6df4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:20 -0800 Subject: [PATCH 041/707] mips: sgi-ip30: optimize heart_alloc_int() by using find_and_set_bit() heart_alloc_int() opencodes find_and_set_bit(). Simplify it by using the dedicated function, and make an nice one-liner. Signed-off-by: Yury Norov --- arch/mips/sgi-ip30/ip30-irq.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/mips/sgi-ip30/ip30-irq.c b/arch/mips/sgi-ip30/ip30-irq.c index 423c32cb66ed52..3c4d4e947817fb 100644 --- a/arch/mips/sgi-ip30/ip30-irq.c +++ b/arch/mips/sgi-ip30/ip30-irq.c @@ -28,17 +28,9 @@ static DEFINE_PER_CPU(unsigned long, irq_enable_mask); static inline int heart_alloc_int(void) { - int bit; + int bit = find_and_set_bit(heart_irq_map, HEART_NUM_IRQS); -again: - bit = find_first_zero_bit(heart_irq_map, HEART_NUM_IRQS); - if (bit >= HEART_NUM_IRQS) - return -ENOSPC; - - if (test_and_set_bit(bit, heart_irq_map)) - goto again; - - return bit; + return bit < HEART_NUM_IRQS ? bit : -ENOSPC; } static void ip30_error_irq(struct irq_desc *desc) From 448a89c116ca108606670621012cc5424016410f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:21 -0800 Subject: [PATCH 042/707] sparc: optimize alloc_msi() by using find_and_set_bit() alloc_msi() opencodes find_and_set_bit(). Simplify it by using the dedicated function, and make an nice one-liner. Signed-off-by: Yury Norov --- arch/sparc/kernel/pci_msi.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c index fc7402948b7bc0..91105c788d1d9d 100644 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@ -96,14 +96,9 @@ static u32 pick_msiq(struct pci_pbm_info *pbm) static int alloc_msi(struct pci_pbm_info *pbm) { - int i; - - for (i = 0; i < pbm->msi_num; i++) { - if (!test_and_set_bit(i, pbm->msi_bitmap)) - return i + pbm->msi_first; - } + int i = find_and_set_bit(pbm->msi_bitmap, pbm->msi_num); - return -ENOENT; + return i < pbm->msi_num ? i + pbm->msi_first : -ENOENT; } static void free_msi(struct pci_pbm_info *pbm, int msi_num) From e905d8a7d76b2b45f70ff0c8584d2ce99ee4c387 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:22 -0800 Subject: [PATCH 043/707] perf/arm: use atomic find_bit() API Simplify subsystem by use atomic find_bit() or atomic API where applicable. CC: Will Deacon Signed-off-by: Yury Norov --- drivers/perf/arm-cci.c | 24 ++++++------------------ drivers/perf/arm-ccn.c | 10 ++-------- drivers/perf/arm_dmc620_pmu.c | 9 ++------- drivers/perf/arm_pmuv3.c | 8 ++------ 4 files changed, 12 insertions(+), 39 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 61de861eaf91e3..cb15b4cee5f75e 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -320,12 +320,9 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu, return CCI400_PMU_CYCLE_CNTR_IDX; } - for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx) - if (!test_and_set_bit(idx, hw->used_mask)) - return idx; - - /* No counters available */ - return -EAGAIN; + idx = find_and_set_next_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1, + CCI400_PMU_CNTR0_IDX); + return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN; } static int cci400_validate_hw_event(struct cci_pmu *cci_pmu, unsigned long hw_event) @@ -802,13 +799,8 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev if (cci_pmu->model->get_event_idx) return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event); - /* Generic code to find an unused idx from the mask */ - for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++) - if (!test_and_set_bit(idx, hw->used_mask)) - return idx; - - /* No counters available */ - return -EAGAIN; + idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1); + return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN; } static int pmu_map_event(struct perf_event *event) @@ -861,12 +853,8 @@ static void pmu_free_irq(struct cci_pmu *cci_pmu) { int i; - for (i = 0; i < cci_pmu->nr_irqs; i++) { - if (!test_and_clear_bit(i, &cci_pmu->active_irqs)) - continue; - + for_each_test_and_clear_bit(i, &cci_pmu->active_irqs, cci_pmu->nr_irqs) free_irq(cci_pmu->irqs[i], cci_pmu); - } } static u32 pmu_read_counter(struct perf_event *event) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 728d13d8e98ac9..d657701b1f236c 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -589,15 +589,9 @@ static const struct attribute_group *arm_ccn_pmu_attr_groups[] = { static int arm_ccn_pmu_alloc_bit(unsigned long *bitmap, unsigned long size) { - int bit; - - do { - bit = find_first_zero_bit(bitmap, size); - if (bit >= size) - return -EAGAIN; - } while (test_and_set_bit(bit, bitmap)); + int bit = find_and_set_bit(bitmap, size); - return bit; + return bit < size ? bit : -EAGAIN; } /* All RN-I and RN-D nodes have identical PMUs */ diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 30cea685957470..e41c84dabc3ebe 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event) end_idx = DMC620_PMU_MAX_COUNTERS; } - for (idx = start_idx; idx < end_idx; ++idx) { - if (!test_and_set_bit(idx, dmc620_pmu->used_mask)) - return idx; - } - - /* The counters are all in use. */ - return -EAGAIN; + idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx); + return idx < end_idx ? idx : -EAGAIN; } static inline diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 6ca7be05229c10..f046ad9e71f1aa 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -825,13 +825,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, struct arm_pmu *cpu_pmu) { - int idx; + int idx = find_and_set_next_bit(cpuc->used_mask, cpu_pmu->num_events, ARMV8_IDX_COUNTER0); - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) { - if (!test_and_set_bit(idx, cpuc->used_mask)) - return idx; - } - return -EAGAIN; + return idx < cpu_pmu->num_events ? idx : -EAGAIN; } static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, From f5f61c6f9f2771aeabdb796302a378e50080d6bf Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:23 -0800 Subject: [PATCH 044/707] drivers/perf: optimize ali_drw_get_counter_idx() by using find_and_set_bit() The function searches used_mask for a set bit in a for-loop bit by bit. Simplify it by using atomic find_and_set_bit(). Signed-off-by: Yury Norov Acked-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index 19d459a36be55c..2a3b7701d568bd 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -274,15 +274,9 @@ static const struct attribute_group *ali_drw_pmu_attr_groups[] = { static int ali_drw_get_counter_idx(struct perf_event *event) { struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu); - int idx; + int idx = find_and_set_bit(drw_pmu->used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS); - for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) { - if (!test_and_set_bit(idx, drw_pmu->used_mask)) - return idx; - } - - /* The counters are all in use. */ - return -EBUSY; + return idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS ? idx : -EBUSY; } static u64 ali_drw_pmu_read_counter(struct perf_event *event) From 37cd1b38270a3eb1555b533ad597252f5bfd9ecc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:24 -0800 Subject: [PATCH 045/707] dmaengine: idxd: optimize perfmon_assign_event() The function searches used_mask for a set bit in a for-loop bit by bit. Simplify it by using atomic find_and_set_bit(), and make a nice one-liner. Signed-off-by: Yury Norov Reviewed-by: Dave Jiang Acked-by: Vinod Koul Reviewed-by: Fenghua Yu --- drivers/dma/idxd/perfmon.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c index fdda6d60426295..4dd9c0d979c388 100644 --- a/drivers/dma/idxd/perfmon.c +++ b/drivers/dma/idxd/perfmon.c @@ -134,13 +134,9 @@ static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) { - int i; - - for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) - if (!test_and_set_bit(i, idxd_pmu->used_mask)) - return i; + int i = find_and_set_bit(idxd_pmu->used_mask, IDXD_PMU_EVENT_MAX); - return -EINVAL; + return i < IDXD_PMU_EVENT_MAX ? i : -EINVAL; } /* From 10922d08df496bc8e0963ce810936f30ad56c81c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:25 -0800 Subject: [PATCH 046/707] ath10k: optimize ath10k_snoc_napi_poll() ath10k_snoc_napi_poll() traverses pending_ce_irqs bitmap bit by bit. Simplify it by using for_each_test_and_clear_bit() iterator. Signed-off-by: Yury Norov --- drivers/net/wireless/ath/ath10k/snoc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c index 2c39bad7ebfb9a..a1db5a973780c4 100644 --- a/drivers/net/wireless/ath/ath10k/snoc.c +++ b/drivers/net/wireless/ath/ath10k/snoc.c @@ -1237,11 +1237,10 @@ static int ath10k_snoc_napi_poll(struct napi_struct *ctx, int budget) return done; } - for (ce_id = 0; ce_id < CE_COUNT; ce_id++) - if (test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs)) { - ath10k_ce_per_engine_service(ar, ce_id); - ath10k_ce_enable_interrupt(ar, ce_id); - } + for_each_test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs, CE_COUNT) { + ath10k_ce_per_engine_service(ar, ce_id); + ath10k_ce_enable_interrupt(ar, ce_id); + } done = ath10k_htt_txrx_compl_task(ar, budget); From f55f49707defc7939df686487535ac5702de9a67 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:26 -0800 Subject: [PATCH 047/707] wifi: rtw88: optimize the driver by using atomic iterator rtw_pci_tx_kick_off() and rtw89_pci_tx_kick_off_pending() traverse bitmaps bit by bit. Simplify it by using atomic for_each_test_and_clear_bit() iterator. Signed-off-by: Yury Norov --- drivers/net/wireless/realtek/rtw88/pci.c | 5 ++--- drivers/net/wireless/realtek/rtw89/pci.c | 5 +---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index 2bfc0e822b8d0b..a0d69c75a38185 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -789,9 +789,8 @@ static void rtw_pci_tx_kick_off(struct rtw_dev *rtwdev) struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv; enum rtw_tx_queue_type queue; - for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++) - if (test_and_clear_bit(queue, rtwpci->tx_queued)) - rtw_pci_tx_kick_off_queue(rtwdev, queue); + for_each_test_and_clear_bit(queue, rtwpci->tx_queued, RTK_MAX_TX_QUEUE_NUM) + rtw_pci_tx_kick_off_queue(rtwdev, queue); } static int rtw_pci_tx_write_data(struct rtw_dev *rtwdev, diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c index 14ddb0d39e6374..184d41b774d7c3 100644 --- a/drivers/net/wireless/realtek/rtw89/pci.c +++ b/drivers/net/wireless/realtek/rtw89/pci.c @@ -1077,10 +1077,7 @@ static void rtw89_pci_tx_kick_off_pending(struct rtw89_dev *rtwdev) struct rtw89_pci_tx_ring *tx_ring; int txch; - for (txch = 0; txch < RTW89_TXCH_NUM; txch++) { - if (!test_and_clear_bit(txch, rtwpci->kick_map)) - continue; - + for_each_test_and_clear_bit(txch, rtwpci->kick_map, RTW89_TXCH_NUM) { tx_ring = &rtwpci->tx_rings[txch]; __rtw89_pci_tx_kick_off(rtwdev, tx_ring); } From 252479be16f72ecd6a0cf0ab88d79cac70eee826 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:27 -0800 Subject: [PATCH 048/707] KVM: x86: hyper-v: optimize and cleanup kvm_hv_process_stimers() The function traverses stimer_pending_bitmap in a for-loop bit by bit. Simplify it by using atomic for_each_test_and_clear_bit(). Because there are only 4 bits, using for_each_test_and_clear_bit() will still generate inline code, so no excessive bloating with the new API. While here, refactor the logic by decreasing indentation level. CC: Sean Christopherson Signed-off-by: Yury Norov Reviewed-by: Vitaly Kuznetsov Acked-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e46d..d541524ca49f74 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -870,27 +870,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) if (!hv_vcpu) return; - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) - if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->config.enable) { - exp_time = stimer->exp_time; - - if (exp_time) { - time_now = - get_time_ref_counter(vcpu->kvm); - if (time_now >= exp_time) - stimer_expiration(stimer); - } - - if ((stimer->config.enable) && - stimer->count) { - if (!stimer->msg_pending) - stimer_start(stimer); - } else - stimer_cleanup(stimer); - } + for_each_test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap, + ARRAY_SIZE(hv_vcpu->stimer)) { + stimer = &hv_vcpu->stimer[i]; + if (!stimer->config.enable) + continue; + + exp_time = stimer->exp_time; + + if (exp_time) { + time_now = get_time_ref_counter(vcpu->kvm); + if (time_now >= exp_time) + stimer_expiration(stimer); } + + if (stimer->config.enable && stimer->count) { + if (!stimer->msg_pending) + stimer_start(stimer); + } else { + stimer_cleanup(stimer); + } + } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) From 020c02d58ef080a486c05f0adf8dd146e2549a74 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:28 -0800 Subject: [PATCH 049/707] PCI: hv: Optimize hv_get_dom_num() by using find_and_set_bit() The function traverses bitmap with for_each_clear_bit() just to allocate a bit atomically. Simplify it by using dedicated find_and_set_bit(). Signed-off-by: Yury Norov Reviewed-by: Michael Kelley Acked-by: Wei Liu Acked-by: Bjorn Helgaas --- drivers/pci/controller/pci-hyperv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 30c7dfeccb16f5..033b1fb7f4eb44 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3605,12 +3605,9 @@ static u16 hv_get_dom_num(u16 dom) if (test_and_set_bit(dom, hvpci_dom_map) == 0) return dom; - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { - if (test_and_set_bit(i, hvpci_dom_map) == 0) - return i; - } + i = find_and_set_bit(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); - return HVPCI_DOM_INVALID; + return i < HVPCI_DOM_MAP_SIZE ? i : HVPCI_DOM_INVALID; } /** From 57dd83bdbe3c0b3a0e83b339cd777568a179e329 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:29 -0800 Subject: [PATCH 050/707] scsi: core: optimize scsi_evt_emit() by using an atomic iterator A plain loop in scsi_evt_thread() opencodes optimized atomic bit traversing macro. Simplify it by using the dedicated iterator. CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/scsi_lib.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index cf3864f7209309..a4c5c9b4bfc94e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2494,14 +2494,13 @@ static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt) void scsi_evt_thread(struct work_struct *work) { struct scsi_device *sdev; - enum scsi_device_event evt_type; + enum scsi_device_event evt_type = SDEV_EVT_FIRST; LIST_HEAD(event_list); sdev = container_of(work, struct scsi_device, event_work); - for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) - if (test_and_clear_bit(evt_type, sdev->pending_events)) - sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); + for_each_test_and_clear_bit_from(evt_type, sdev->pending_events, SDEV_EVT_LAST + 1) + sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); while (1) { struct scsi_event *evt; From b0bfc29429fd4d989b0d7e6901c60e453d888e45 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:30 -0800 Subject: [PATCH 051/707] scsi: mpi3mr: optimize the driver by using find_and_set_bit() mpi3mr_dev_rmhs_send_tm() and mpi3mr_send_event_ack() opencode find_and_set_bit(). Simplify them by using dedicated function. CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/mpi3mr/mpi3mr_os.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index 040031eb0c12d4..11139a2008fdac 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -2276,13 +2276,9 @@ static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle, if (drv_cmd) goto issue_cmd; do { - cmd_idx = find_first_zero_bit(mrioc->devrem_bitmap, - MPI3MR_NUM_DEVRMCMD); - if (cmd_idx < MPI3MR_NUM_DEVRMCMD) { - if (!test_and_set_bit(cmd_idx, mrioc->devrem_bitmap)) - break; - cmd_idx = MPI3MR_NUM_DEVRMCMD; - } + cmd_idx = find_and_set_bit(mrioc->devrem_bitmap, MPI3MR_NUM_DEVRMCMD); + if (cmd_idx < MPI3MR_NUM_DEVRMCMD) + break; } while (retrycount--); if (cmd_idx >= MPI3MR_NUM_DEVRMCMD) { @@ -2417,14 +2413,9 @@ static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event, "sending event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n", event, event_ctx); do { - cmd_idx = find_first_zero_bit(mrioc->evtack_cmds_bitmap, - MPI3MR_NUM_EVTACKCMD); - if (cmd_idx < MPI3MR_NUM_EVTACKCMD) { - if (!test_and_set_bit(cmd_idx, - mrioc->evtack_cmds_bitmap)) - break; - cmd_idx = MPI3MR_NUM_EVTACKCMD; - } + cmd_idx = find_and_set_bit(mrioc->evtack_cmds_bitmap, MPI3MR_NUM_EVTACKCMD); + if (cmd_idx < MPI3MR_NUM_EVTACKCMD) + break; } while (retrycount--); if (cmd_idx >= MPI3MR_NUM_EVTACKCMD) { From 2cedc5c4cbed533323b2de3147dcae85d1b656f6 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:31 -0800 Subject: [PATCH 052/707] scsi: qedi: optimize qedi_get_task_idx() by using find_and_set_bit() qedi_get_task_idx() opencodes find_and_set_bit(). Simplify it and make the whole function a simiple almost one-liner. CC: Bart Van Assche Signed-off-by: Yury Norov --- drivers/scsi/qedi/qedi_main.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index cd0180b1f5b9da..2f940c6898ef3d 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1824,20 +1824,13 @@ int qedi_get_task_idx(struct qedi_ctx *qedi) { s16 tmp_idx; -again: - tmp_idx = find_first_zero_bit(qedi->task_idx_map, - MAX_ISCSI_TASK_ENTRIES); + tmp_idx = find_and_set_bit(qedi->task_idx_map, MAX_ISCSI_TASK_ENTRIES); if (tmp_idx >= MAX_ISCSI_TASK_ENTRIES) { QEDI_ERR(&qedi->dbg_ctx, "FW task context pool is full.\n"); tmp_idx = -1; - goto err_idx; } - if (test_and_set_bit(tmp_idx, qedi->task_idx_map)) - goto again; - -err_idx: return tmp_idx; } From 1e9e099525e5fea93c761c289a8f30542b306da4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:32 -0800 Subject: [PATCH 053/707] powerpc: optimize arch code by using atomic find_bit() API Use find_and_{set,clear}_bit() where appropriate and simplify the logic. Signed-off-by: Yury Norov --- arch/powerpc/mm/book3s32/mmu_context.c | 10 ++--- arch/powerpc/platforms/pasemi/dma_lib.c | 45 +++++----------------- arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++---- 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index 1922f9a6b05850..7db19f173c2ed6 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; unsigned long __init_new_context(void) { - unsigned long ctx = next_mmu_context; + unsigned long ctx; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, next_mmu_context); + if (ctx > LAST_CONTEXT) + ctx = 0; next_mmu_context = (ctx + 1) & LAST_CONTEXT; return ctx; diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 1be1f18f6f0982..906dabee013249 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type type) limit = MAX_TXCH; break; } -retry: - bit = find_next_bit(txch_free, MAX_TXCH, start); - if (bit >= limit) - return -ENOSPC; - if (!test_and_clear_bit(bit, txch_free)) - goto retry; - - return bit; + + bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start); + return bit < limit ? bit : -ENOSPC; } static void pasemi_free_tx_chan(int chan) @@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan) static int pasemi_alloc_rx_chan(void) { - int bit; -retry: - bit = find_first_bit(rxch_free, MAX_RXCH); - if (bit >= MAX_TXCH) - return -ENOSPC; - if (!test_and_clear_bit(bit, rxch_free)) - goto retry; - - return bit; + int bit = find_and_clear_bit(rxch_free, MAX_RXCH); + + return bit < MAX_TXCH ? bit : -ENOSPC; } static void pasemi_free_rx_chan(int chan) @@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf); */ int pasemi_dma_alloc_flag(void) { - int bit; + int bit = find_and_clear_bit(flags_free, MAX_FLAGS); -retry: - bit = find_first_bit(flags_free, MAX_FLAGS); - if (bit >= MAX_FLAGS) - return -ENOSPC; - if (!test_and_clear_bit(bit, flags_free)) - goto retry; - - return bit; + return bit < MAX_FLAGS ? bit : -ENOSPC; } EXPORT_SYMBOL(pasemi_dma_alloc_flag); @@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag); */ int pasemi_dma_alloc_fun(void) { - int bit; - -retry: - bit = find_first_bit(fun_free, MAX_FLAGS); - if (bit >= MAX_FLAGS) - return -ENOSPC; - if (!test_and_clear_bit(bit, fun_free)) - goto retry; + int bit = find_and_clear_bit(fun_free, MAX_FLAGS); - return bit; + return bit < MAX_FLAGS ? bit : -ENOSPC; } EXPORT_SYMBOL(pasemi_dma_alloc_fun); diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c index 59882da3e74253..640e387e6d839c 100644 --- a/arch/powerpc/platforms/powernv/pci-sriov.c +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb, static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov) { - int win; + int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1); - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - return -1; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + if (win >= phb->ioda.m64_bar_idx + 1) + return -1; set_bit(win, iov->used_m64_bar_mask); - return win; } From 2ebbcd7bcb17f6695329416e82256ce87c362eaa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:33 -0800 Subject: [PATCH 054/707] iommu: optimize subsystem by using atomic find_bit() API Simplify __arm_smmu_alloc_bitmap() and msm_iommu_alloc_ctx() by using a dedicated API, and make them nice one-liner wrappers. While here, refactor msm_iommu_attach_dev() and msm_iommu_alloc_ctx() so that error codes don't mismatch. Signed-off-by: Yury Norov --- drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 ++-------- drivers/iommu/msm_iommu.c | 18 ++++-------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 703fd5817ec11f..004a4704ebf15c 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -453,15 +453,9 @@ struct arm_smmu_impl { static inline int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end) { - int idx; + int idx = find_and_set_next_bit(map, end, start); - do { - idx = find_next_zero_bit(map, end, start); - if (idx == end) - return -ENOSPC; - } while (test_and_set_bit(idx, map)); - - return idx; + return idx < end ? idx : -ENOSPC; } static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index f86af9815d6f98..67124f4228b1f0 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -185,17 +185,9 @@ static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_add_page = __flush_iotlb_page, }; -static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end) +static int msm_iommu_alloc_ctx(struct msm_iommu_dev *iommu) { - int idx; - - do { - idx = find_next_zero_bit(map, end, start); - if (idx == end) - return -ENOSPC; - } while (test_and_set_bit(idx, map)); - - return idx; + return find_and_set_bit(iommu->context_map, iommu->ncb); } static void msm_iommu_free_ctx(unsigned long *map, int idx) @@ -418,10 +410,8 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) ret = -EEXIST; goto fail; } - master->num = - msm_iommu_alloc_ctx(iommu->context_map, - 0, iommu->ncb); - if (IS_ERR_VALUE(master->num)) { + master->num = msm_iommu_alloc_ctx(iommu); + if (master->num >= iommu->ncb) { ret = -ENODEV; goto fail; } From 4678bace092ce861e09a6ab4dd3f0e043bdb49a5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:34 -0800 Subject: [PATCH 055/707] media: radio-shark: optimize the driver by using atomic find_bit() API Despite that it's only 2- or 3-bit maps, convert for-loop followed by test_bit() to for_each_test_and_clear_bit() as it makes the code cleaner. Signed-off-by: Yury Norov Acked-by: Hans Verkuil --- drivers/media/radio/radio-shark.c | 5 +---- drivers/media/radio/radio-shark2.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/media/radio/radio-shark.c b/drivers/media/radio/radio-shark.c index 127a3be0e0f070..0c50b3a9623e88 100644 --- a/drivers/media/radio/radio-shark.c +++ b/drivers/media/radio/radio-shark.c @@ -158,10 +158,7 @@ static void shark_led_work(struct work_struct *work) container_of(work, struct shark_device, led_work); int i, res, brightness, actual_len; - for (i = 0; i < 3; i++) { - if (!test_and_clear_bit(i, &shark->brightness_new)) - continue; - + for_each_test_and_clear_bit(i, &shark->brightness_new, 3) { brightness = atomic_read(&shark->brightness[i]); memset(shark->transfer_buffer, 0, TB_LEN); if (i != RED_LED) { diff --git a/drivers/media/radio/radio-shark2.c b/drivers/media/radio/radio-shark2.c index f1c5c0a6a335cb..d9ef241e177806 100644 --- a/drivers/media/radio/radio-shark2.c +++ b/drivers/media/radio/radio-shark2.c @@ -145,10 +145,7 @@ static void shark_led_work(struct work_struct *work) container_of(work, struct shark_device, led_work); int i, res, brightness, actual_len; - for (i = 0; i < 2; i++) { - if (!test_and_clear_bit(i, &shark->brightness_new)) - continue; - + for_each_test_and_clear_bit(i, &shark->brightness_new, 2) { brightness = atomic_read(&shark->brightness[i]); memset(shark->transfer_buffer, 0, TB_LEN); shark->transfer_buffer[0] = 0x83 + i; From 7b39dbf951db07db3001f6113db0765c9598c00a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:35 -0800 Subject: [PATCH 056/707] sfc: optimize the driver by using atomic find_bit() API SFC code traverses rps_slot_map and rxq_retry_mask bit by bit. Simplify it by using dedicated atomic find_bit() functions, as they skip already clear bits. Signed-off-by: Yury Norov Reviewed-by: Edward Cree --- drivers/net/ethernet/sfc/rx_common.c | 4 +--- drivers/net/ethernet/sfc/siena/rx_common.c | 4 +--- drivers/net/ethernet/sfc/siena/siena_sriov.c | 14 ++++++-------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index d2f35ee15effeb..0112968b3fe7c6 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -950,9 +950,7 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, int rc; /* find a free slot */ - for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++) - if (!test_and_set_bit(slot_idx, &efx->rps_slot_map)) - break; + slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT); if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT) return -EBUSY; diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 4579f43484c367..160b16aa74862b 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -958,9 +958,7 @@ int efx_siena_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, int rc; /* find a free slot */ - for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++) - if (!test_and_set_bit(slot_idx, &efx->rps_slot_map)) - break; + slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT); if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT) return -EBUSY; diff --git a/drivers/net/ethernet/sfc/siena/siena_sriov.c b/drivers/net/ethernet/sfc/siena/siena_sriov.c index 8353c15dc23336..554b799288b8e2 100644 --- a/drivers/net/ethernet/sfc/siena/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena/siena_sriov.c @@ -722,14 +722,12 @@ static int efx_vfdi_fini_all_queues(struct siena_vf *vf) efx_vfdi_flush_wake(vf), timeout); rxqs_count = 0; - for (index = 0; index < count; ++index) { - if (test_and_clear_bit(index, vf->rxq_retry_mask)) { - atomic_dec(&vf->rxq_retry_count); - MCDI_SET_ARRAY_DWORD( - inbuf, FLUSH_RX_QUEUES_IN_QID_OFST, - rxqs_count, vf_offset + index); - rxqs_count++; - } + for_each_test_and_clear_bit(index, vf->rxq_retry_mask, count) { + atomic_dec(&vf->rxq_retry_count); + MCDI_SET_ARRAY_DWORD( + inbuf, FLUSH_RX_QUEUES_IN_QID_OFST, + rxqs_count, vf_offset + index); + rxqs_count++; } } From 468cf2a9e8267d38c3756ea6576db439d9e219c1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:36 -0800 Subject: [PATCH 057/707] tty: nozomi: optimize interrupt_handler() In the exit path of interrupt_handler(), dc->flip map is traversed bit by bit to find and clear set bits and call tty_flip_buffer_push() for corresponding ports. Simplify it by using for_each_test_and_clear_bit(), as it skips already clear bits. Signed-off-by: Yury Norov --- drivers/tty/nozomi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 02cd40147b3a80..de0503247391a5 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -1220,9 +1220,8 @@ static irqreturn_t interrupt_handler(int irq, void *dev_id) exit_handler: spin_unlock(&dc->spin_mutex); - for (a = 0; a < NOZOMI_MAX_PORTS; a++) - if (test_and_clear_bit(a, &dc->flip)) - tty_flip_buffer_push(&dc->port[a].port); + for_each_test_and_clear_bit(a, &dc->flip, NOZOMI_MAX_PORTS) + tty_flip_buffer_push(&dc->port[a].port); return IRQ_HANDLED; none: From d744113d7dacab078a5272378ee6a2c478d50cc5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:37 -0800 Subject: [PATCH 058/707] usb: cdc-acm: optimize acm_softint() acm_softint() uses for-loop to traverse urbs_in_error_delay bitmap bit by bit to find and clear set bits. Simplify it by using for_each_test_and_clear_bit(), because it doesn't test already clear bits. Signed-off-by: Yury Norov Acked-by: Oliver Neukum --- drivers/usb/class/cdc-acm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index a1f4e1ead97ff4..8664b63050b0c7 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -613,9 +613,8 @@ static void acm_softint(struct work_struct *work) } if (test_and_clear_bit(ACM_ERROR_DELAY, &acm->flags)) { - for (i = 0; i < acm->rx_buflimit; i++) - if (test_and_clear_bit(i, &acm->urbs_in_error_delay)) - acm_submit_read_urb(acm, i, GFP_KERNEL); + for_each_test_and_clear_bit(i, &acm->urbs_in_error_delay, acm->rx_buflimit) + acm_submit_read_urb(acm, i, GFP_KERNEL); } if (test_and_clear_bit(EVENT_TTY_WAKEUP, &acm->flags)) From f3687f2f7db4f0f464ab5cb43595c0964655eded Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:38 -0800 Subject: [PATCH 059/707] block: null_blk: replace get_tag() with a generic find_and_set_bit_lock() get_tag() opencodes find_and_set_bit(). Simplify the code by getting rid of it. Signed-off-by: Yury Norov Reviewed-by: Chengming Zhou Reviewed-by: Jan Kara --- drivers/block/null_blk/main.c | 41 +++++++++++------------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 3021d58ca51c1f..671dbb9ab928af 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -760,19 +760,6 @@ static void put_tag(struct nullb_queue *nq, unsigned int tag) wake_up(&nq->wait); } -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - static void free_cmd(struct nullb_cmd *cmd) { put_tag(cmd->nq, cmd->tag); @@ -782,24 +769,22 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) { + unsigned int tag = find_and_set_bit_lock(nq->tag_map, nq->queue_depth); struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; + + if (tag >= nq->queue_depth) + return NULL; + + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; } - return NULL; + return cmd; } static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) From fe80b801ee439640f9a1b9df80b7ae0bc4d4bfb8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:39 -0800 Subject: [PATCH 060/707] RDMA/rtrs: optimize __rtrs_get_permit() by using find_and_set_bit_lock() The function opencodes find_and_set_bit_lock() with a while-loop polling on test_and_set_bit_lock(). Use the dedicated function instead. Signed-off-by: Yury Norov --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 07261523c55473..2f3b0ad42e8aa7 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -72,18 +72,9 @@ __rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type) struct rtrs_permit *permit; int bit; - /* - * Adapted from null_blk get_tag(). Callers from different cpus may - * grab the same bit, since find_first_zero_bit is not atomic. - * But then the test_and_set_bit_lock will fail for all the - * callers but one, so that they will loop again. - * This way an explicit spinlock is not required. - */ - do { - bit = find_first_zero_bit(clt->permits_map, max_depth); - if (bit >= max_depth) - return NULL; - } while (test_and_set_bit_lock(bit, clt->permits_map)); + bit = find_and_set_bit_lock(clt->permits_map, max_depth); + if (bit >= max_depth) + return NULL; permit = get_permit(clt, bit); WARN_ON(permit->mem_id != bit); From fea0ea785cef13d881cdc3c5136d98851f14335f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:40 -0800 Subject: [PATCH 061/707] mISDN: optimize get_free_devid() get_free_devid() traverses each bit in device_ids in an open-coded loop. Simplify it by using the dedicated find_and_set_bit(). It makes the whole function a nice one-liner, and because MAX_DEVICE_ID is a small constant-time value (63), on 64-bit platforms find_and_set_bit() call will be optimized to: ffs(); test_and_set_bit(). Signed-off-by: Yury Norov --- drivers/isdn/mISDN/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index ab8513a7acd52d..c829c4eac0e23e 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -197,14 +197,9 @@ get_mdevice_count(void) static int get_free_devid(void) { - u_int i; + int i = find_and_set_bit((u_long *)&device_ids, MAX_DEVICE_ID + 1); - for (i = 0; i <= MAX_DEVICE_ID; i++) - if (!test_and_set_bit(i, (u_long *)&device_ids)) - break; - if (i > MAX_DEVICE_ID) - return -EBUSY; - return i; + return i <= MAX_DEVICE_ID ? i : -EBUSY; } int From fbde99eaa647bc93ca3fbf78f4cec929beabf0f7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:41 -0800 Subject: [PATCH 062/707] media: em28xx: cx231xx: optimize drivers by using find_and_set_bit() Functions in the media/usb drivers opencode find_and_set_bit(). Simplify them by using the function. Signed-off-by: Yury Norov Acked-by: Hans Verkuil --- drivers/media/usb/cx231xx/cx231xx-cards.c | 16 ++++------ drivers/media/usb/em28xx/em28xx-cards.c | 37 +++++++++-------------- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/drivers/media/usb/cx231xx/cx231xx-cards.c b/drivers/media/usb/cx231xx/cx231xx-cards.c index 92efe6c1f47bae..b314603932d772 100644 --- a/drivers/media/usb/cx231xx/cx231xx-cards.c +++ b/drivers/media/usb/cx231xx/cx231xx-cards.c @@ -1708,16 +1708,12 @@ static int cx231xx_usb_probe(struct usb_interface *interface, return -ENODEV; /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(&cx231xx_devused, CX231XX_MAXBOARDS); - if (nr >= CX231XX_MAXBOARDS) { - /* No free device slots */ - dev_err(d, - "Supports only %i devices.\n", - CX231XX_MAXBOARDS); - return -ENOMEM; - } - } while (test_and_set_bit(nr, &cx231xx_devused)); + nr = find_and_set_bit(&cx231xx_devused, CX231XX_MAXBOARDS); + if (nr >= CX231XX_MAXBOARDS) { + /* No free device slots */ + dev_err(d, "Supports only %i devices.\n", CX231XX_MAXBOARDS); + return -ENOMEM; + } udev = usb_get_dev(interface_to_usbdev(interface)); diff --git a/drivers/media/usb/em28xx/em28xx-cards.c b/drivers/media/usb/em28xx/em28xx-cards.c index 4d037c92af7c58..af4809fe74a857 100644 --- a/drivers/media/usb/em28xx/em28xx-cards.c +++ b/drivers/media/usb/em28xx/em28xx-cards.c @@ -3684,17 +3684,14 @@ static int em28xx_duplicate_dev(struct em28xx *dev) return -ENOMEM; } /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS); - if (nr >= EM28XX_MAXBOARDS) { - /* No free device slots */ - dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n", - EM28XX_MAXBOARDS); - kfree(sec_dev); - dev->dev_next = NULL; - return -ENOMEM; - } - } while (test_and_set_bit(nr, em28xx_devused)); + nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS); + if (nr >= EM28XX_MAXBOARDS) { + /* No free device slots */ + dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n", EM28XX_MAXBOARDS); + kfree(sec_dev); + dev->dev_next = NULL; + return -ENOMEM; + } sec_dev->devno = nr; snprintf(sec_dev->name, 28, "em28xx #%d", nr); sec_dev->dev_next = NULL; @@ -3827,17 +3824,13 @@ static int em28xx_usb_probe(struct usb_interface *intf, udev = usb_get_dev(interface_to_usbdev(intf)); /* Check to see next free device and mark as used */ - do { - nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS); - if (nr >= EM28XX_MAXBOARDS) { - /* No free device slots */ - dev_err(&intf->dev, - "Driver supports up to %i em28xx boards.\n", - EM28XX_MAXBOARDS); - retval = -ENOMEM; - goto err_no_slot; - } - } while (test_and_set_bit(nr, em28xx_devused)); + nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS); + if (nr >= EM28XX_MAXBOARDS) { + /* No free device slots */ + dev_err(&intf->dev, "Driver supports up to %i em28xx boards.\n", EM28XX_MAXBOARDS); + retval = -ENOMEM; + goto err_no_slot; + } /* Don't register audio interfaces */ if (intf->altsetting[0].desc.bInterfaceClass == USB_CLASS_AUDIO) { From 35a11cd220c710b8c42203cf07aea6f6f873f6a5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:42 -0800 Subject: [PATCH 063/707] ethernet: rocker: optimize ofdpa_port_internal_vlan_id_get() Optimize ofdpa_port_internal_vlan_id_get() by using find_and_set_bit(), instead of polling every bit from bitmap in a for-loop. Signed-off-by: Yury Norov --- drivers/net/ethernet/rocker/rocker_ofdpa.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 826990459fa443..449be8af7ffce6 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2249,14 +2249,11 @@ static __be16 ofdpa_port_internal_vlan_id_get(struct ofdpa_port *ofdpa_port, found = entry; hash_add(ofdpa->internal_vlan_tbl, &found->entry, found->ifindex); - for (i = 0; i < OFDPA_N_INTERNAL_VLANS; i++) { - if (test_and_set_bit(i, ofdpa->internal_vlan_bitmap)) - continue; + i = find_and_set_bit(ofdpa->internal_vlan_bitmap, OFDPA_N_INTERNAL_VLANS); + if (i < OFDPA_N_INTERNAL_VLANS) found->vlan_id = htons(OFDPA_INTERNAL_VLAN_ID_BASE + i); - goto found; - } - - netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n"); + else + netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n"); found: found->ref_count++; From e63a961be48f2855a76d034e23471995ad6b9972 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:43 -0800 Subject: [PATCH 064/707] serial: sc12is7xx: optimize sc16is7xx_alloc_line() Instead of polling every bit in sc16is7xx_lines, use a dedicated find_and_set_bit(), and make the function a simple one-liner. Signed-off-by: Yury Norov --- drivers/tty/serial/sc16is7xx.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index db2bb1c0d36c26..6a463988d5e002 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -427,15 +427,9 @@ static void sc16is7xx_port_update(struct uart_port *port, u8 reg, static int sc16is7xx_alloc_line(void) { - int i; - BUILD_BUG_ON(SC16IS7XX_MAX_DEVS > BITS_PER_LONG); - for (i = 0; i < SC16IS7XX_MAX_DEVS; i++) - if (!test_and_set_bit(i, &sc16is7xx_lines)) - break; - - return i; + return find_and_set_bit(&sc16is7xx_lines, SC16IS7XX_MAX_DEVS); } static void sc16is7xx_power(struct uart_port *port, int on) From 137ce860bc80dd529ab6b04dde2a6e761b32b3ec Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:44 -0800 Subject: [PATCH 065/707] bluetooth: optimize cmtp_alloc_block_id() Instead of polling every bit in blockids, use a dedicated find_and_set_bit(), and make the function a simple one-liner. Signed-off-by: Yury Norov --- net/bluetooth/cmtp/core.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 90d130588a3e51..b1330acbbff366 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -88,15 +88,9 @@ static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_connin static inline int cmtp_alloc_block_id(struct cmtp_session *session) { - int i, id = -1; + int id = find_and_set_bit(&session->blockids, 16); - for (i = 0; i < 16; i++) - if (!test_and_set_bit(i, &session->blockids)) { - id = i; - break; - } - - return id; + return id < 16 ? id : -1; } static inline void cmtp_free_block_id(struct cmtp_session *session, int id) From 668284d460b55af73b9a001aad57b25eb1f674b7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:45 -0800 Subject: [PATCH 066/707] net: smc: optimize smc_wr_tx_get_free_slot_index() Simplify the function by using find_and_set_bit() and make it a simple almost one-liner. While here, drop explicit initialization of *idx, because it's already initialized by the caller in case of ENOLINK, or set properly with ->wr_tx_mask, if nothing is found, in case of EBUSY. CC: Tony Lu Signed-off-by: Yury Norov Reviewed-by: Alexandra Winter Reviewed-by: Wen Gu --- net/smc/smc_wr.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0021065a600a03..b6f0cfc527882f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -170,15 +170,11 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { - *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; - for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { - if (!test_and_set_bit(*idx, link->wr_tx_mask)) - return 0; - } - *idx = link->wr_tx_cnt; - return -EBUSY; + + *idx = find_and_set_bit(link->wr_tx_mask, link->wr_tx_cnt); + return *idx < link->wr_tx_cnt ? 0 : -EBUSY; } /** From 78cdf2d0f4565c5866eb1ec2105397835003f15d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:46 -0800 Subject: [PATCH 067/707] ALSA: use atomic find_bit() functions where applicable ALSA code tests each bit in bitmaps in a for() loop. Switch it to using dedicated atomic find_bit() API. Signed-off-by: Yury Norov Acked-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 7 +++---- sound/usb/caiaq/audio.c | 13 +++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 01718b1fc9a7f8..29254005f3941a 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -3275,10 +3275,9 @@ static int get_empty_pcm_device(struct hda_bus *bus, unsigned int type) #ifdef CONFIG_SND_DYNAMIC_MINORS /* non-fixed slots starting from 10 */ - for (i = 10; i < 32; i++) { - if (!test_and_set_bit(i, bus->pcm_dev_bits)) - return i; - } + i = find_and_set_next_bit(bus->pcm_dev_bits, 32, 10); + if (i < 32) + return i; #endif dev_warn(bus->card->dev, "Too many %s devices\n", diff --git a/sound/usb/caiaq/audio.c b/sound/usb/caiaq/audio.c index 4981753652a7fe..74dfcf32b439d2 100644 --- a/sound/usb/caiaq/audio.c +++ b/sound/usb/caiaq/audio.c @@ -610,7 +610,7 @@ static void read_completed(struct urb *urb) struct snd_usb_caiaq_cb_info *info = urb->context; struct snd_usb_caiaqdev *cdev; struct device *dev; - struct urb *out = NULL; + struct urb *out; int i, frame, len, send_it = 0, outframe = 0; unsigned long flags; size_t offset = 0; @@ -625,17 +625,14 @@ static void read_completed(struct urb *urb) return; /* find an unused output urb that is unused */ - for (i = 0; i < N_URBS; i++) - if (test_and_set_bit(i, &cdev->outurb_active_mask) == 0) { - out = cdev->data_urbs_out[i]; - break; - } - - if (!out) { + i = find_and_set_bit(&cdev->outurb_active_mask, N_URBS); + if (i >= N_URBS) { dev_err(dev, "Unable to find an output urb to use\n"); goto requeue; } + out = cdev->data_urbs_out[i]; + /* read the recently received packet and send back one which has * the same layout */ for (frame = 0; frame < FRAMES_PER_URB; frame++) { From 4d56bf2e0c2321894cdb654b96944710fbd62314 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:47 -0800 Subject: [PATCH 068/707] m68k: optimize get_mmu_context() get_mmu_context() opencodes atomic find_and_set_bit_wrap(). Simplify it by using find_and_set_bit_wrap(). CC: Geert Uytterhoeven Signed-off-by: Yury Norov Acked-by: Greg Ungerer --- arch/m68k/include/asm/mmu_context.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h index 141bbdfad96019..0419ad87a1c122 100644 --- a/arch/m68k/include/asm/mmu_context.h +++ b/arch/m68k/include/asm/mmu_context.h @@ -35,12 +35,11 @@ static inline void get_mmu_context(struct mm_struct *mm) atomic_inc(&nr_free_contexts); steal_context(); } - ctx = next_mmu_context; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + + do { + ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context); + } while (ctx > LAST_CONTEXT); + next_mmu_context = (ctx + 1) & LAST_CONTEXT; mm->context = ctx; context_mm[ctx] = mm; From 18eda5a178066852986b0380f201295cefa582e7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:48 -0800 Subject: [PATCH 069/707] microblaze: optimize get_mmu_context() Simplify get_mmu_context() by using find_and_set_bit_wrap(). Signed-off-by: Yury Norov --- arch/microblaze/include/asm/mmu_context_mm.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index c2c77f70845562..209c3a62353a99 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -82,12 +82,11 @@ static inline void get_mmu_context(struct mm_struct *mm) return; while (atomic_dec_if_positive(&nr_free_contexts) < 0) steal_context(); - ctx = next_mmu_context; - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } + + do { + ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context); + } while (ctx > LAST_CONTEXT); + next_mmu_context = (ctx + 1) & LAST_CONTEXT; mm->context = ctx; context_mm[ctx] = mm; From 5e95ee6fd52b06432da9636032ac6986112feec1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 11 Dec 2023 18:27:49 -0800 Subject: [PATCH 070/707] sh: mach-x3proto: optimize ilsel_enable() Simplify ilsel_enable() by using find_and_set_bit(). CC: John Paul Adrian Glaubitz Signed-off-by: Yury Norov --- arch/sh/boards/mach-x3proto/ilsel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-x3proto/ilsel.c b/arch/sh/boards/mach-x3proto/ilsel.c index f0d5eb41521a49..7fadc479a80bf7 100644 --- a/arch/sh/boards/mach-x3proto/ilsel.c +++ b/arch/sh/boards/mach-x3proto/ilsel.c @@ -99,8 +99,8 @@ int ilsel_enable(ilsel_source_t set) } do { - bit = find_first_zero_bit(&ilsel_level_map, ILSEL_LEVELS); - } while (test_and_set_bit(bit, &ilsel_level_map)); + bit = find_and_set_bit(&ilsel_level_map, ILSEL_LEVELS); + } while (bit >= ILSEL_LEVELS); __ilsel_enable(set, bit); From f655182f9e9edda559b41a1f8b3b9c944443694a Mon Sep 17 00:00:00 2001 From: Kartik Date: Wed, 20 Dec 2023 11:40:13 +0530 Subject: [PATCH 071/707] soc/tegra: fuse: Define tegra194_soc_attr_group for Tegra241 Tegra241 SoC data uses tegra194_soc_attr_group, which is only defined if config CONFIG_ARCH_TEGRA_194_SOC or CONFIG_ARCH_TEGRA_234_SOC or both are enabled. This causes a build failure if both of these configs are disabled and CONFIG_ARCH_TEGRA_241_SOC is enabled. Define tegra194_soc_attr_group if CONFIG_ARCH_TEGRA_241_SOC is enabled. Signed-off-by: Kartik Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 3 ++- drivers/soc/tegra/fuse/fuse.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 233b8e7bb41bf9..c34efa5bf44c2f 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -407,7 +407,8 @@ const struct attribute_group tegra_soc_attr_group = { }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \ - IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) + IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) || \ + IS_ENABLED(CONFIG_ARCH_TEGRA_241_SOC) static ssize_t platform_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h index f3b705327c20f8..9fee6ad6ad9e98 100644 --- a/drivers/soc/tegra/fuse/fuse.h +++ b/drivers/soc/tegra/fuse/fuse.h @@ -124,7 +124,8 @@ extern const struct tegra_fuse_soc tegra186_fuse_soc; #endif #if IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \ - IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) + IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) || \ + IS_ENABLED(CONFIG_ARCH_TEGRA_241_SOC) extern const struct attribute_group tegra194_soc_attr_group; #endif From 071ad962baf5e857fd965595421cf6fb588610ed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 22 Dec 2023 16:04:02 +0200 Subject: [PATCH 072/707] bitmap: Step down as a reviewer Too many things are going on, and reviewing BITMAP related code seems not the best I can do, hence step down as a reviewer of the BITMAP library. Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b733a..51983ed2d4e483 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3553,7 +3553,6 @@ F: include/uapi/linux/bfs_fs.h BITMAP API M: Yury Norov -R: Andy Shevchenko R: Rasmus Villemoes S: Maintained F: include/linux/bitfield.h From cceb1ba628230a5fb076d7155731572d0393a903 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 27 Dec 2023 11:10:03 +0100 Subject: [PATCH 073/707] Bluetooth: hci_bcm4377: do not mark valid bd_addr as invalid A recent commit restored the original (and still documented) semantics for the HCI_QUIRK_USE_BDADDR_PROPERTY quirk so that the device address is considered invalid unless an address is provided by firmware. This specifically means that this flag must only be set for devices with invalid addresses, but the Broadcom BCM4377 driver has so far been setting this flag unconditionally. Fortunately the driver already checks for invalid addresses during setup and sets the HCI_QUIRK_INVALID_BDADDR flag, which can simply be replaced with HCI_QUIRK_USE_BDADDR_PROPERTY to indicate that the default address is invalid but can be overridden by firmware (long term, this should probably just always be allowed). Fixes: 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk") Cc: stable@vger.kernel.org # 6.5 Reported-by: Felix Zhang Link: https://lore.kernel.org/r/77419ffacc5b4875e920e038332575a2a5bff29f.camel@mrman314.tech/ Signed-off-by: Johan Hovold Reported-by: Felix Zhang Reviewed-by: Neal Gompa Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_bcm4377.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c index a617578356953c..9a7243d5db71ff 100644 --- a/drivers/bluetooth/hci_bcm4377.c +++ b/drivers/bluetooth/hci_bcm4377.c @@ -1417,7 +1417,7 @@ static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377) bda = (struct hci_rp_read_bd_addr *)skb->data; if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr)) - set_bit(HCI_QUIRK_INVALID_BDADDR, &bcm4377->hdev->quirks); + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &bcm4377->hdev->quirks); kfree_skb(skb); return 0; @@ -2368,7 +2368,6 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id) hdev->set_bdaddr = bcm4377_hci_set_bdaddr; hdev->setup = bcm4377_hci_setup; - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); if (bcm4377->hw->broken_mws_transport_config) set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks); if (bcm4377->hw->broken_ext_scan) From 8f9bb7a1b81ba24c52b11e7b8af6684c01659b78 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Tue, 26 Dec 2023 19:45:17 +0800 Subject: [PATCH 074/707] Bluetooth: btrtl: Add the support for RTL8852BT/RTL8852BE-VT Add the support for RTL8852BT/RTL8852BE-VT BT controller on USB interface. The necessary firmware will be submitted to linux-firmware project. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 8 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0bda ProdID=8520 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Max Chou Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btrtl.c | 14 ++++++++++++++ drivers/bluetooth/btusb.c | 3 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 277d039ecbb429..cc50de69e8dc98 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -69,6 +69,7 @@ enum btrtl_chip_id { CHIP_ID_8852B = 20, CHIP_ID_8852C = 25, CHIP_ID_8851B = 36, + CHIP_ID_8852BT = 47, }; struct id_table { @@ -307,6 +308,15 @@ static const struct id_table ic_id_table[] = { .fw_name = "rtl_bt/rtl8851bu_fw", .cfg_name = "rtl_bt/rtl8851bu_config", .hw_info = "rtl8851bu" }, + + /* 8852BT/8852BE-VT */ + { IC_INFO(RTL_ROM_LMP_8852A, 0x87, 0xc, HCI_USB), + .config_needed = false, + .has_rom_version = true, + .has_msft_ext = true, + .fw_name = "rtl_bt/rtl8852btu_fw", + .cfg_name = "rtl_bt/rtl8852btu_config", + .hw_info = "rtl8852btu" }, }; static const struct id_table *btrtl_match_ic(u16 lmp_subver, u16 hci_rev, @@ -645,6 +655,7 @@ static int rtlbt_parse_firmware(struct hci_dev *hdev, { RTL_ROM_LMP_8852A, 20 }, /* 8852B */ { RTL_ROM_LMP_8852A, 25 }, /* 8852C */ { RTL_ROM_LMP_8851B, 36 }, /* 8851B */ + { RTL_ROM_LMP_8852A, 47 }, /* 8852BT */ }; if (btrtl_dev->fw_len <= 8) @@ -1275,6 +1286,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) case CHIP_ID_8852B: case CHIP_ID_8852C: case CHIP_ID_8851B: + case CHIP_ID_8852BT: set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); @@ -1505,6 +1517,8 @@ MODULE_FIRMWARE("rtl_bt/rtl8852bs_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bs_config.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bu_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852bu_config.bin"); +MODULE_FIRMWARE("rtl_bt/rtl8852btu_fw.bin"); +MODULE_FIRMWARE("rtl_bt/rtl8852btu_config.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw_v2.bin"); MODULE_FIRMWARE("rtl_bt/rtl8852cu_config.bin"); diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 7835170b1d6618..a0a317bac0954f 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -553,6 +553,9 @@ static const struct usb_device_id quirks_table[] = { { USB_DEVICE(0x13d3, 0x3572), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + /* Realtek 8852BT/8852BE-VT Bluetooth devices */ + { USB_DEVICE(0x0bda, 0x8520), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), .driver_info = BTUSB_REALTEK }, From c7ee0bc8db325ec829bbe2cd0114071489ed915f Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Wed, 27 Dec 2023 18:59:27 +0530 Subject: [PATCH 075/707] Bluetooth: btnxpuart: Resolve TX timeout error in power save stress test This fixes the tx timeout issue seen while running a stress test on btnxpuart for couple of hours, such that the interval between two HCI commands coincide with the power save timeout value of 2 seconds. Test procedure using bash script: hciconfig hci0 up //Enable Power Save feature hcitool -i hci0 cmd 3f 23 02 00 00 while (true) do hciconfig hci0 leadv sleep 2 hciconfig hci0 noleadv sleep 2 done Error log, after adding few more debug prints: Bluetooth: btnxpuart_queue_skb(): 01 0A 20 01 00 Bluetooth: hci0: Set UART break: on, status=0 Bluetooth: hci0: btnxpuart_tx_wakeup() tx_work scheduled Bluetooth: hci0: btnxpuart_tx_work() dequeue: 01 0A 20 01 00 Can't set advertise mode on hci0: Connection timed out (110) Bluetooth: hci0: command 0x200a tx timeout When the power save mechanism turns on UART break, and btnxpuart_tx_work() is scheduled simultaneously, psdata->ps_state is read as PS_STATE_AWAKE, which prevents the psdata->work from being scheduled, which is responsible to turn OFF UART break. This issue is fixed by adding a ps_lock mutex around UART break on/off as well as around ps_state read/write. btnxpuart_tx_wakeup() will now read updated ps_state value. If ps_state is PS_STATE_SLEEP, it will first schedule psdata->work, and then it will reschedule itself once UART break has been turned off and ps_state is PS_STATE_AWAKE. Tested above script for 50,000 iterations and TX timeout error was not observed anymore. Signed-off-by: Neeraj Sanjay Kale Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index b7c56be078f815..7f88b6f52f2663 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -126,6 +126,7 @@ struct ps_data { struct hci_dev *hdev; struct work_struct work; struct timer_list ps_timer; + struct mutex ps_lock; }; struct wakeup_cmd_payload { @@ -317,6 +318,9 @@ static void ps_start_timer(struct btnxpuart_dev *nxpdev) if (psdata->cur_psmode == PS_MODE_ENABLE) mod_timer(&psdata->ps_timer, jiffies + msecs_to_jiffies(psdata->h2c_ps_interval)); + + if (psdata->ps_state == PS_STATE_AWAKE && psdata->ps_cmd == PS_CMD_ENTER_PS) + cancel_work_sync(&psdata->work); } static void ps_cancel_timer(struct btnxpuart_dev *nxpdev) @@ -337,6 +341,7 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state) !test_bit(BTNXPUART_SERDEV_OPEN, &nxpdev->tx_state)) return; + mutex_lock(&psdata->ps_lock); switch (psdata->cur_h2c_wakeupmode) { case WAKEUP_METHOD_DTR: if (ps_state == PS_STATE_AWAKE) @@ -350,12 +355,15 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state) status = serdev_device_break_ctl(nxpdev->serdev, 0); else status = serdev_device_break_ctl(nxpdev->serdev, -1); + msleep(20); /* Allow chip to detect UART-break and enter sleep */ bt_dev_dbg(hdev, "Set UART break: %s, status=%d", str_on_off(ps_state == PS_STATE_SLEEP), status); break; } if (!status) psdata->ps_state = ps_state; + mutex_unlock(&psdata->ps_lock); + if (ps_state == PS_STATE_AWAKE) btnxpuart_tx_wakeup(nxpdev); } @@ -391,17 +399,25 @@ static void ps_setup(struct hci_dev *hdev) psdata->hdev = hdev; INIT_WORK(&psdata->work, ps_work_func); + mutex_init(&psdata->ps_lock); timer_setup(&psdata->ps_timer, ps_timeout_func, 0); } -static void ps_wakeup(struct btnxpuart_dev *nxpdev) +static bool ps_wakeup(struct btnxpuart_dev *nxpdev) { struct ps_data *psdata = &nxpdev->psdata; + u8 ps_state; - if (psdata->ps_state != PS_STATE_AWAKE) { + mutex_lock(&psdata->ps_lock); + ps_state = psdata->ps_state; + mutex_unlock(&psdata->ps_lock); + + if (ps_state != PS_STATE_AWAKE) { psdata->ps_cmd = PS_CMD_EXIT_PS; schedule_work(&psdata->work); + return true; } + return false; } static int send_ps_cmd(struct hci_dev *hdev, void *data) @@ -1171,7 +1187,6 @@ static struct sk_buff *nxp_dequeue(void *data) { struct btnxpuart_dev *nxpdev = (struct btnxpuart_dev *)data; - ps_wakeup(nxpdev); ps_start_timer(nxpdev); return skb_dequeue(&nxpdev->txq); } @@ -1186,6 +1201,9 @@ static void btnxpuart_tx_work(struct work_struct *work) struct sk_buff *skb; int len; + if (ps_wakeup(nxpdev)) + return; + while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); hdev->stat.byte_tx += len; From 4b60f3880d23d7f5d4f632ccd4fb14e2d7f2631e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:28 +0900 Subject: [PATCH 076/707] btrfs: zoned: factor out prepare_allocation_zoned() Factor out prepare_allocation_zoned() for further extension. While at it, optimize the if-branch a bit. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c5796..d260b970bec775 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4298,6 +4298,24 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4326,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } From 62ec22633c6a748d8e7ee2e171491b155e3242a7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:29 +0900 Subject: [PATCH 077/707] btrfs: zoned: optimize hint byte for zoned allocator Writing sequentially to a huge file on btrfs on a SMR HDD revealed a decline of the performance (220 MiB/s to 30 MiB/s after 500 minutes). The performance goes down because of increased latency of the extent allocation, which is induced by a traversing of a lot of full block groups. So, this patch optimizes the ffe_ctl->hint_byte by choosing a block group with sufficient size from the active block group list, which does not contain full block groups. After applying the patch, the performance is maintained well. Fixes: 2eda57089ea3 ("btrfs: zoned: implement sequential extent allocation") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d260b970bec775..6d680031211a1c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4311,6 +4311,24 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotinically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; From a5592dcde6c61257f00c78830aa6a2d044aabf43 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Dec 2023 11:47:45 +0300 Subject: [PATCH 078/707] btrfs: fix kvcalloc() arguments order in btrfs_ioctl_send() When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning: fs/btrfs/send.c: In function 'btrfs_ioctl_send': fs/btrfs/send.c:8208:44: warning: 'kvcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 8208 | sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), | ^ Since 'n' and 'size' arguments of 'kvcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Signed-off-by: Dmitry Antipov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e580..2d7519a6ce72d3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; From aa434e486204ddac5c4a6ea20e8085d482c27e01 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 3 Jan 2024 13:31:27 +0300 Subject: [PATCH 079/707] btrfs: ref-verify: free ref cache before clearing mount opt As clearing REF_VERIFY mount option indicates there were some errors in a ref-verify process, a ref cache is not relevant anymore and should be freed. btrfs_free_ref_cache() requires REF_VERIFY option being set so call it just before clearing the mount option. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Reported-by: syzbot+be14ed7728594dc8bd42@syzkaller.appspotmail.com Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") CC: stable@vger.kernel.org # 5.4+ Closes: https://lore.kernel.org/lkml/000000000000e5a65c05ee832054@google.com/ Reported-by: syzbot+c563a3c79927971f950f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/0000000000007fe09705fdc6086c@google.com/ Reviewed-by: Anand Jain Signed-off-by: Fedor Pchelkin Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e9931b..8c4fc98ca9ce7d 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; From 690381ca057e24c0ad192f35a28cf0ab6f009be5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 01:46:16 +0900 Subject: [PATCH 080/707] btrfs: fix unbalanced unlock of mapping_tree_lock The error path of btrfs_get_chunk_map() releases fs_info->mapping_tree_lock. But, it is taken and released in btrfs_find_chunk_map(). So, there is no need to do so. Fixes: 7dc66abb5a47 ("btrfs: use a dedicated data structure for chunk maps") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2ff..d67785be2c778c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, From c31b9639579d137855cbda6fa4045b499571cc0b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 13:56:34 +0900 Subject: [PATCH 081/707] btrfs: zoned: fix lock ordering in btrfs_zone_activate() The btrfs CI reported a lockdep warning as follows by running generic generic/129. WARNING: possible circular locking dependency detected 6.7.0-rc5+ #1 Not tainted ------------------------------------------------------ kworker/u5:5/793427 is trying to acquire lock: ffff88813256d028 (&cache->lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x5e/0x130 but task is already holding lock: ffff88810a23a318 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x34/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}: ... -> #0 (&cache->lock){+.+.}-{2:2}: ... This is because we take fs_info->zone_active_bgs_lock after a block_group's lock in btrfs_zone_activate() while doing the opposite in other places. Fix the issue by expanding the fs_info->zone_active_bgs_lock's critical section and taking it before a block_group's lock. Fixes: a7e1ac7bdc5a ("btrfs: zoned: reserve zones for an active metadata/system block group") CC: stable@vger.kernel.org # 6.6 Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 12066afc235c31..ac9bbe0c4ffe69 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2072,6 +2072,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2084,7 +2085,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2104,20 +2104,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2125,8 +2122,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2134,6 +2129,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } From 626cef40faf0b363be5d552800adb5845774662b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Tue, 2 Jan 2024 19:08:08 +0100 Subject: [PATCH 082/707] Bluetooth: hci_sync: Check the correct flag before starting a scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a very confusing mistake in the code starting a HCI inquiry: We're calling hci_dev_test_flag() to test for HCI_INQUIRY, but hci_dev_test_flag() checks hdev->dev_flags instead of hdev->flags. HCI_INQUIRY is a bit that's set on hdev->flags, not on hdev->dev_flags though. HCI_INQUIRY equals the integer 7, and in hdev->dev_flags, 7 means HCI_BONDABLE, so we were actually checking for HCI_BONDABLE here. The mistake is only present in the synchronous code for starting an inquiry, not in the async one. Also devices are typically bondable while doing an inquiry, so that might be the reason why nobody noticed it so far. Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Signed-off-by: Jonas Dreßler Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a6fc8a2a5c673d..b3141e3f9cf620 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5559,7 +5559,7 @@ static int hci_inquiry_sync(struct hci_dev *hdev, u8 length) bt_dev_dbg(hdev, ""); - if (hci_dev_test_flag(hdev, HCI_INQUIRY)) + if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); From 4b85ee4187306be98113b8fa290d535ae6efa812 Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Thu, 4 Jan 2024 11:56:32 +0000 Subject: [PATCH 083/707] Bluetooth: Avoid potential use-after-free in hci_error_reset While handling the HCI_EV_HARDWARE_ERROR event, if the underlying BT controller is not responding, the GPIO reset mechanism would free the hci_dev and lead to a use-after-free in hci_error_reset. Here's the call trace observed on a ChromeOS device with Intel AX201: queue_work_on+0x3e/0x6c __hci_cmd_sync_sk+0x2ee/0x4c0 [bluetooth ] ? init_wait_entry+0x31/0x31 __hci_cmd_sync+0x16/0x20 [bluetooth ] hci_error_reset+0x4f/0xa4 [bluetooth ] process_one_work+0x1d8/0x33f worker_thread+0x21b/0x373 kthread+0x13a/0x152 ? pr_cont_work+0x54/0x54 ? kthread_blkcg+0x31/0x31 ret_from_fork+0x1f/0x30 This patch holds the reference count on the hci_dev while processing a HCI_EV_HARDWARE_ERROR event to avoid potential crash. Fixes: c7741d16a57c ("Bluetooth: Perform a power cycle when receiving hardware error event") Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 65601aa52e0d8b..2821a42cefdc6e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1049,6 +1049,7 @@ static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); + hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) @@ -1056,10 +1057,10 @@ static void hci_error_reset(struct work_struct *work) else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); - if (hci_dev_do_close(hdev)) - return; + if (!hci_dev_do_close(hdev)) + hci_dev_do_open(hdev); - hci_dev_do_open(hdev); + hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) From 0bcd317e8b31833d36cd9843902905aafbd70017 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Jan 2024 10:43:26 -0500 Subject: [PATCH 084/707] Bluetooth: hci_sync: Fix accept_list when attempting to suspend During suspend, only wakeable devices can be in acceptlist, so if the device was previously added it needs to be removed otherwise the device can end up waking up the system prematurely. Fixes: 3b42055388c3 ("Bluetooth: hci_sync: Fix attempting to suspend with unfiltered passive scan") Signed-off-by: Clancy Shang Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b3141e3f9cf620..5716345a26dfb7 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2206,8 +2206,11 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && - !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) + !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { + hci_le_del_accept_list_sync(hdev, ¶ms->addr, + params->addr_type); return 0; + } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) From 6ec00b0737fe9108ec8a1995e20349b91a89ce07 Mon Sep 17 00:00:00 2001 From: Yuxuan Hu <20373622@buaa.edu.cn> Date: Wed, 3 Jan 2024 17:10:43 +0800 Subject: [PATCH 085/707] Bluetooth: rfcomm: Fix null-ptr-deref in rfcomm_check_security During our fuzz testing of the connection and disconnection process at the RFCOMM layer, we discovered this bug. By comparing the packets from a normal connection and disconnection process with the testcase that triggered a KASAN report. We analyzed the cause of this bug as follows: 1. In the packets captured during a normal connection, the host sends a `Read Encryption Key Size` type of `HCI_CMD` packet (Command Opcode: 0x1408) to the controller to inquire the length of encryption key.After receiving this packet, the controller immediately replies with a Command Completepacket (Event Code: 0x0e) to return the Encryption Key Size. 2. In our fuzz test case, the timing of the controller's response to this packet was delayed to an unexpected point: after the RFCOMM and L2CAP layers had disconnected but before the HCI layer had disconnected. 3. After receiving the Encryption Key Size Response at the time described in point 2, the host still called the rfcomm_check_security function. However, by this time `struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;` had already been released, and when the function executed `return hci_conn_security(conn->hcon, d->sec_level, auth_type, d->out);`, specifically when accessing `conn->hcon`, a null-ptr-deref error occurred. To fix this bug, check if `sk->sk_state` is BT_CLOSED before calling rfcomm_recv_frame in rfcomm_process_rx. Signed-off-by: Yuxuan Hu <20373622@buaa.edu.cn> Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 053ef8f25fae47..1d34d849703329 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1941,7 +1941,7 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) /* Get data directly from socket receive queue without copying it. */ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); - if (!skb_linearize(skb)) { + if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) { s = rfcomm_recv_frame(s, skb); if (!s) break; From 31f8df9e1976d1e0ae1588303d856d966d6a2847 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 13:50:20 +1030 Subject: [PATCH 086/707] btrfs: remove the pg_offset parameter from btrfs_get_extent() The parameter @pg_offset of btrfs_get_extent() is only utilized for inlined extent, and we already have an ASSERT() and tree-checker, to make sure we can only get inline extent at file offset 0. Any invalid inline extent with non-zero file offset would be rejected by tree-checker in the first place. Thus the @pg_offset parameter is not really necessary, just remove it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/extent_io.c | 10 ++++----- fs/btrfs/file.c | 11 +++++----- fs/btrfs/inode.c | 16 ++++++--------- fs/btrfs/tests/inode-tests.c | 40 ++++++++++++++++++------------------ 5 files changed, 36 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7f7c5a92d2b879..83d78a6f3aa2f3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -490,8 +490,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); + struct page *page, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a0ffd41c5cc19f..0ea8c401e3a88a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -970,8 +970,7 @@ void clear_page_extent_mapped(struct page *page) folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, +static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -988,7 +987,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, start, len); if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -1051,8 +1050,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_page_read(page, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); @@ -1371,7 +1369,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, len); + em = btrfs_get_extent(inode, NULL, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 38dfcac4760990..f8e1a7ce3d39ae 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2176,7 +2176,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2835,7 +2835,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2866,7 +2866,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2909,8 +2909,7 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3126,7 +3125,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3e39610cc95a3..1002ebde14da20 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2632,7 +2632,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -4888,8 +4888,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -6737,7 +6736,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6751,8 +6749,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6895,7 +6892,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -7536,7 +7532,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (ret < 0) goto err; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -10125,7 +10121,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; @@ -10698,7 +10694,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f7806d1..99da9d34b77aed 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; From f48705f473cea37efeeaa6a197ae12730c112863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:47 +0100 Subject: [PATCH 087/707] Bluetooth: Remove HCI_POWER_OFF_TIMEOUT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"), the power off sequence got refactored so that this timeout was no longer necessary, let's remove the leftover define from the header too. Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index bdee5d649cc61d..f7918c7551834b 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -437,7 +437,6 @@ enum { #define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ -#define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ From 2e7a6a997c9a9e5a7d15c09f22d0add5672c0906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:48 +0100 Subject: [PATCH 088/707] Bluetooth: mgmt: Remove leftover queuing of power_off work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Queuing of power_off work was introduced in these functions with commits 8b064a3ad377 ("Bluetooth: Clean up HCI state when doing power off") and c9910d0fb4fc ("Bluetooth: Fix disconnecting connections in non-connected states") in an effort to clean up state and do things like disconnecting devices before actually powering off the device. After that, commit a3172b7eb4a2 ("Bluetooth: Add timer to force power off") introduced a timeout to ensure that the device actually got powered off, even if some of the cleanup work would never complete. This code later got refactored with commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"), which made powering off the device synchronous and removed the need for initiating the power_off work from other places. The timeout mentioned above got removed too, because we now also made use of the command timeout during power on/off. These days the power_off work still exists, but it only seems to only be used for HCI_AUTO_OFF functionality, which is why we never noticed those two leftover places where we queue power_off work. So let's remove that code. Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index bb72ff6eb22f4b..d1c55e409659f0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9764,14 +9764,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, struct mgmt_ev_device_disconnected ev; struct sock *sk = NULL; - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } - if (!mgmt_connected) return; @@ -9828,14 +9820,6 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, { struct mgmt_ev_connect_failed ev; - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } - bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.status = mgmt_status(status); From 18035d4098a0129131f3f5dcd0d3e233fe647e10 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 12:30:44 +1030 Subject: [PATCH 089/707] btrfs: remove unused variable bio_offset from end_bbio_data_read() The variable @bio_offset was introduced in commit 7ffd27e378d2 ("btrfs: pass bio_offset to check_data_csum() directly"), when we are still using the same endio function for both data and metadata. Later we had several changes to data and metadata endio functions: - Data verification is handled by btrfs bio layer - Split data and metadata endio paths Now for data path we no longer do any verification in end_bbio_data_read(), as the verification is handled by btrfs bio layer already. Thus there is no need for such bio_offset variable. Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0ea8c401e3a88a..3dc387edc164a9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -596,11 +596,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { @@ -667,10 +662,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); From 08236d11031b9001d2223d708094e5e4882a9ddc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 5 Jan 2024 16:05:55 +1030 Subject: [PATCH 090/707] btrfs: cache folio size and shift in extent_buffer After the conversion to folio interfaces (but without the patch to enable larger folio allocation), there is a LTP report about observable performance drop on metadata heavy operations. This drop is caused by the extra code of calculating the folio_size()/folio_shift(), instead of the old hard coded PAGE_SIZE/PAGE_SHIFT. To slightly reduce the overhead, just cache both folio_size and folio_shift in extent_buffer. The two new members (u32 folio_size and u8 folio_shift) is stored inside the holes of extent_buffer. (folio_size is shared with len, which is reduced to u32). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 12 ++++++------ fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 38 +++++++++++++++++++++----------------- fs/btrfs/extent_io.h | 16 +++++++++++++--- 5 files changed, 42 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 1925a0919ca62f..6eb850ad37d2ae 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -63,8 +63,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ @@ -117,8 +117,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -151,7 +151,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e65e012bac5531..33145da449cc8d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6907d533fe839..57be7dd44da5db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -193,7 +193,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3dc387edc164a9..c8aabe3be16978 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -78,7 +78,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); @@ -729,6 +729,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } @@ -1728,10 +1730,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -3523,7 +3525,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + if (folio_size(existing_folio) != eb->folio_size) { folio_unlock(existing_folio); folio_put(existing_folio); return -EAGAIN; @@ -3666,6 +3668,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * and free the allocated page. */ folio = eb->folios[i]; + eb->folio_size = folio_size(folio); + eb->folio_shift = folio_shift(folio); spin_lock(&mapping->private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_folio(eb, folio, prealloc); @@ -4115,7 +4119,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } @@ -4135,7 +4139,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4164,7 +4168,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4204,7 +4208,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4244,7 +4248,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4315,7 +4319,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4364,7 +4368,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4395,7 +4399,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4417,7 +4421,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4473,10 +4477,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4590,7 +4594,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 46050500529bff..8e5639597800a7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -8,6 +8,7 @@ #include #include #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" @@ -75,7 +76,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +92,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; struct rw_semaphore lock; @@ -113,6 +116,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -151,13 +161,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* From 2b16c80d801185e748b7e3f7ff9e842bb9fb4267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:49 +0100 Subject: [PATCH 091/707] Bluetooth: Add new state HCI_POWERING_DOWN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new state HCI_POWERING_DOWN that indicates that the device is currently powering down, this will be useful for the next commit. Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_sync.c | 16 +++++++++++----- net/bluetooth/mgmt.c | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index f7918c7551834b..a94a8491ec7a1a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -372,6 +372,7 @@ enum { HCI_SETUP, HCI_CONFIG, HCI_DEBUGFS_CREATED, + HCI_POWERING_DOWN, HCI_AUTO_OFF, HCI_RFKILLED, HCI_MGMT, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5716345a26dfb7..b146562a65fc40 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5403,27 +5403,33 @@ static int hci_power_off_sync(struct hci_dev *hdev) if (!test_bit(HCI_UP, &hdev->flags)) return 0; + hci_dev_set_flag(hdev, HCI_POWERING_DOWN); + if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) - return err; + goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) - return err; + goto out; err = hci_stop_discovery_sync(hdev); if (err) - return err; + goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) - return err; + goto out; + + err = hci_dev_close_sync(hdev); - return hci_dev_close_sync(hdev); +out: + hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); + return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d1c55e409659f0..cabc5466401754 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1388,6 +1388,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + if (!cp->val) { + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_BUSY); + goto failed; + } + } + if (pending_find(MGMT_OP_SET_POWERED, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, MGMT_STATUS_BUSY); @@ -9746,6 +9754,9 @@ bool mgmt_powering_down(struct hci_dev *hdev) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return true; + cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return false; @@ -10053,6 +10064,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) /* If this is a HCI command related to powering on the * HCI dev don't send any mgmt signals. */ + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return; + if (pending_find(MGMT_OP_SET_POWERED, hdev)) return; } From 088656165c2d9c9236b64ab45c51f509f22b1eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Sun, 7 Jan 2024 19:02:50 +0100 Subject: [PATCH 092/707] Bluetooth: Disconnect connected devices before rfkilling adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a lot of platforms (at least the MS Surface devices, M1 macbooks, and a few ThinkPads) firmware doesn't do its job when rfkilling a device and the bluetooth adapter is not actually shut down properly on rfkill. This leads to connected devices remaining in connected state and the bluetooth connection eventually timing out after rfkilling an adapter. Use the rfkill hook in the HCI driver to go through the full power-off sequence (including stopping scans and disconnecting devices) before rfkilling it, just like MGMT_OP_SET_POWERED would do. In case anything during the larger power-off sequence fails, make sure the device is still closed and the rfkill ends up being effective in the end. Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2821a42cefdc6e..e5cb618fa6d39c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -940,20 +940,51 @@ int hci_get_dev_info(void __user *arg) /* ---- Interface to HCI drivers ---- */ +static int hci_dev_do_poweroff(struct hci_dev *hdev) +{ + int err; + + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_sync_lock(hdev); + + err = hci_set_powered_sync(hdev, false); + + hci_req_sync_unlock(hdev); + + return err; +} + static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; + int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; + if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) + return 0; + if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); + if (!hci_dev_test_flag(hdev, HCI_SETUP) && - !hci_dev_test_flag(hdev, HCI_CONFIG)) - hci_dev_do_close(hdev); + !hci_dev_test_flag(hdev, HCI_CONFIG)) { + err = hci_dev_do_poweroff(hdev); + if (err) { + bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", + err); + + /* Make sure the device is still closed even if + * anything during power off sequence (eg. + * disconnecting devices) failed. + */ + hci_dev_do_close(hdev); + } + } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } From de4b77527d36ac19ef88e518e4ae7148cc8cc210 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:46 -0800 Subject: [PATCH 093/707] btrfs: don't abort filesystem when attempting to snapshot deleted subvolume If the source file descriptor to the snapshot ioctl refers to a deleted subvolume, we get the following abort: BTRFS: Transaction aborted (error -2) WARNING: CPU: 0 PID: 833 at fs/btrfs/transaction.c:1875 create_pending_snapshot+0x1040/0x1190 [btrfs] Modules linked in: pata_acpi btrfs ata_piix libata scsi_mod virtio_net blake2b_generic xor net_failover virtio_rng failover scsi_common rng_core raid6_pq libcrc32c CPU: 0 PID: 833 Comm: t_snapshot_dele Not tainted 6.7.0-rc6 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:create_pending_snapshot+0x1040/0x1190 [btrfs] RSP: 0018:ffffa09c01337af8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9982053e7c78 RCX: 0000000000000027 RDX: ffff99827dc20848 RSI: 0000000000000001 RDI: ffff99827dc20840 RBP: ffffa09c01337c00 R08: 0000000000000000 R09: ffffa09c01337998 R10: 0000000000000003 R11: ffffffffb96da248 R12: fffffffffffffffe R13: ffff99820535bb28 R14: ffff99820b7bd000 R15: ffff99820381ea80 FS: 00007fe20aadabc0(0000) GS:ffff99827dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559a120b502f CR3: 00000000055b6000 CR4: 00000000000006f0 Call Trace: ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? __warn+0x81/0x130 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? report_bug+0x171/0x1a0 ? handle_bug+0x3a/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? create_pending_snapshot+0x1040/0x1190 [btrfs] create_pending_snapshots+0x92/0xc0 [btrfs] btrfs_commit_transaction+0x66b/0xf40 [btrfs] btrfs_mksubvol+0x301/0x4d0 [btrfs] btrfs_mksnapshot+0x80/0xb0 [btrfs] __btrfs_ioctl_snap_create+0x1c2/0x1d0 [btrfs] btrfs_ioctl_snap_create_v2+0xc4/0x150 [btrfs] btrfs_ioctl+0x8a6/0x2650 [btrfs] ? kmem_cache_free+0x22/0x340 ? do_sys_openat2+0x97/0xe0 __x64_sys_ioctl+0x97/0xd0 do_syscall_64+0x46/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7fe20abe83af RSP: 002b:00007ffe6eff1360 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fe20abe83af RDX: 00007ffe6eff23c0 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 00007fe20ad16cd0 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe6eff13c0 R14: 00007fe20ad45000 R15: 0000559a120b6d58 ---[ end trace 0000000000000000 ]--- BTRFS: error (device vdc: state A) in create_pending_snapshot:1875: errno=-2 No such entry BTRFS info (device vdc: state EA): forced readonly BTRFS warning (device vdc: state EA): Skipping commit of aborted transaction. BTRFS: error (device vdc: state EA) in cleanup_transaction:2055: errno=-2 No such entry This happens because create_pending_snapshot() initializes the new root item as a copy of the source root item. This includes the refs field, which is 0 for a deleted subvolume. The call to btrfs_insert_root() therefore inserts a root with refs == 0. btrfs_get_new_fs_root() then finds the root and returns -ENOENT if refs == 0, which causes create_pending_snapshot() to abort. Fix it by checking the source root's refs before attempting the snapshot, but after locking subvol_sem to avoid racing with deletion. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1743904202b78..0af214c8bef4d2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; From 92dd093f23d224e2317e47376ea2796c79fa969f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:47 -0800 Subject: [PATCH 094/707] btrfs: avoid copying BTRFS_ROOT_SUBVOL_DEAD flag to snapshot of subvolume being deleted Sweet Tea spotted a race between subvolume deletion and snapshotting that can result in the root item for the snapshot having the BTRFS_ROOT_SUBVOL_DEAD flag set. The race is: Thread 1 | Thread 2 ----------------------------------------------|---------- btrfs_delete_subvolume | btrfs_set_root_flags(BTRFS_ROOT_SUBVOL_DEAD)| |btrfs_mksubvol | down_read(subvol_sem) | create_snapshot | ... | create_pending_snapshot | copy root item from source down_write(subvol_sem) | This flag is only checked in send and swap activate, which this would cause to fail mysteriously. create_snapshot() now checks the root refs to reject a deleted subvolume, so we can fix this by locking subvol_sem earlier so that the BTRFS_ROOT_SUBVOL_DEAD flag and the root refs are updated atomically. CC: stable@vger.kernel.org # 4.14+ Reported-by: Sweet Tea Dorminy Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1002ebde14da20..e285ddbcdee04f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); From 7974b2128489d062c9d21419633eebde07f07032 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 9 Jan 2024 19:03:23 +0800 Subject: [PATCH 095/707] Bluetooth: hci_event: Fix wrongly recorded wakeup BD_ADDR hci_store_wake_reason() wrongly parses event HCI_Connection_Request as HCI_Connection_Complete and HCI_Connection_Complete as HCI_Connection_Request, so causes recording wakeup BD_ADDR error and potential stability issue, fix it by using the correct field. Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ef8c3bed73617e..22b22c264c2a5e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7420,10 +7420,10 @@ static void hci_store_wake_reason(struct hci_dev *hdev, u8 event, * keep track of the bdaddr of the connection event that woke us up. */ if (event == HCI_EV_CONN_REQUEST) { - bacpy(&hdev->wake_addr, &conn_complete->bdaddr); + bacpy(&hdev->wake_addr, &conn_request->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_CONN_COMPLETE) { - bacpy(&hdev->wake_addr, &conn_request->bdaddr); + bacpy(&hdev->wake_addr, &conn_complete->bdaddr); hdev->wake_addr_type = BDADDR_BREDR; } else if (event == HCI_EV_LE_META) { struct hci_ev_le_meta *le_ev = (void *)skb->data; From a7ee39bea31519849985453b9acbab06894ee0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Mon, 8 Jan 2024 23:46:06 +0100 Subject: [PATCH 096/707] Bluetooth: Remove superfluous call to hci_conn_check_pending() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "pending connections" feature was originally introduced with commit 4c67bc74f016 ("[Bluetooth] Support concurrent connect requests") and 6bd57416127e ("[Bluetooth] Handling pending connect attempts after inquiry") to handle controllers supporting only a single connection request at a time. Later things were extended to also cancel ongoing inquiries on connect() with commit 89e65975fea5 ("Bluetooth: Cancel Inquiry before Create Connection"). With commit a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only opcodes"), hci_conn_check_pending() was introduced as a helper to consolidate a few places where we check for pending connections (indicated by the BT_CONNECT2 flag) and then try to connect. This refactoring commit also snuck in two more calls to hci_conn_check_pending(): - One is in the failure callback of hci_cs_inquiry(), this one probably makes sense: If we send an "HCI Inquiry" command and then immediately after a "Create Connection" command, the "Create Connection" command might fail before the "HCI Inquiry" command, and then we want to retry the "Create Connection" on failure of the "HCI Inquiry". - The other added call to hci_conn_check_pending() is in the event handler for the "Remote Name" event, this seems unrelated and is possibly a copy-paste error, so remove that one. Fixes: a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only opcodes") Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 22b22c264c2a5e..23e0e63ac312be 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3556,8 +3556,6 @@ static void hci_remote_name_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - hci_conn_check_pending(hdev); - hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); From f8c47ee39e6dc6170da06865b84e8c8b08e87ab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= Date: Mon, 8 Jan 2024 23:46:07 +0100 Subject: [PATCH 097/707] Bluetooth: hci_event: Use HCI error defines instead of magic values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have error defines already, so let's use them. Signed-off-by: Jonas Dreßler Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_event.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a94a8491ec7a1a..1cd212bb378916 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -653,6 +653,7 @@ enum { #define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_COMMAND_DISALLOWED 0x0c #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_INVALID_PARAMETERS 0x12 @@ -661,6 +662,7 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1e #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 23e0e63ac312be..6130c969f361a7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -95,11 +95,11 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, /* It is possible that we receive Inquiry Complete event right * before we receive Inquiry Cancel Command Complete event, in * which case the latter event should have status of Command - * Disallowed (0x0c). This should not be treated as error, since + * Disallowed. This should not be treated as error, since * we actually achieve what Inquiry Cancel wants to achieve, * which is to end the last Inquiry session. */ - if (rp->status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) { bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); rp->status = 0x00; } @@ -2342,7 +2342,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (status) { if (conn && conn->state == BT_CONNECT) { - if (status != 0x0c || conn->attempt > 2) { + if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) { conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); @@ -6679,7 +6679,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, * transition into connected state and mark it as * successful. */ - if (!conn->out && ev->status == 0x1a && + if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE && (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) status = 0x00; else From 711c35949648ba19f54bce27b49ced0ad90b19b9 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 9 Jan 2024 13:45:40 -0500 Subject: [PATCH 098/707] Bluetooth: hci_core: Cancel request on command timeout If command has timed out call __hci_cmd_sync_cancel to notify the hci_req since it will inevitably cause a timeout. This also rework the code around __hci_cmd_sync_cancel since it was wrongly assuming it needs to cancel timer as well, but sometimes the timers have not been started or in fact they already had timed out in which case they don't need to be cancel yet again. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_core.c | 84 ++++++++++++++++++++++---------- net/bluetooth/hci_request.c | 2 +- net/bluetooth/hci_sync.c | 20 ++++---- net/bluetooth/mgmt.c | 2 +- 5 files changed, 71 insertions(+), 39 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 6efbc2152146bd..e2582c24254498 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -42,7 +42,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, void hci_cmd_sync_init(struct hci_dev *hdev); void hci_cmd_sync_clear(struct hci_dev *hdev); void hci_cmd_sync_cancel(struct hci_dev *hdev, int err); -void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err); +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err); int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e5cb618fa6d39c..de730d210ccb45 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1523,10 +1523,11 @@ static void hci_cmd_timeout(struct work_struct *work) cmd_timer.work); if (hdev->sent_cmd) { - struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; - u16 opcode = __le16_to_cpu(sent->opcode); + u16 opcode = hci_skb_opcode(hdev->sent_cmd); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); + + hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } @@ -2857,6 +2858,23 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev) return ret; } +/* Cancel ongoing command synchronously: + * + * - Cancel command timer + * - Reset command counter + * - Cancel command request + */ +static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) +{ + bt_dev_dbg(hdev, "err 0x%2.2x", err); + + cancel_delayed_work_sync(&hdev->cmd_timer); + cancel_delayed_work_sync(&hdev->ncmd_timer); + atomic_set(&hdev->cmd_cnt, 1); + + hci_cmd_sync_cancel_sync(hdev, -err); +} + /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { @@ -2874,7 +2892,7 @@ int hci_suspend_dev(struct hci_dev *hdev) return 0; /* Cancel potentially blocking sync operation before suspend */ - __hci_cmd_sync_cancel(hdev, -EHOSTDOWN); + hci_cancel_cmd_sync(hdev, -EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); @@ -4159,6 +4177,33 @@ static void hci_rx_work(struct work_struct *work) } } +static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) +{ + int err; + + bt_dev_dbg(hdev, "skb %p", skb); + + kfree_skb(hdev->sent_cmd); + + hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); + if (!hdev->sent_cmd) { + skb_queue_head(&hdev->cmd_q, skb); + queue_work(hdev->workqueue, &hdev->cmd_work); + return; + } + + err = hci_send_frame(hdev, skb); + if (err < 0) { + hci_cmd_sync_cancel_sync(hdev, err); + return; + } + + if (hci_req_status_pend(hdev)) + hci_dev_set_flag(hdev, HCI_CMD_PENDING); + + atomic_dec(&hdev->cmd_cnt); +} + static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); @@ -4173,30 +4218,15 @@ static void hci_cmd_work(struct work_struct *work) if (!skb) return; - kfree_skb(hdev->sent_cmd); - - hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); - if (hdev->sent_cmd) { - int res; - if (hci_req_status_pend(hdev)) - hci_dev_set_flag(hdev, HCI_CMD_PENDING); - atomic_dec(&hdev->cmd_cnt); + hci_send_cmd_sync(hdev, skb); - res = hci_send_frame(hdev, skb); - if (res < 0) - __hci_cmd_sync_cancel(hdev, -res); - - rcu_read_lock(); - if (test_bit(HCI_RESET, &hdev->flags) || - hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) - cancel_delayed_work(&hdev->cmd_timer); - else - queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, - HCI_CMD_TIMEOUT); - rcu_read_unlock(); - } else { - skb_queue_head(&hdev->cmd_q, skb); - queue_work(hdev->workqueue, &hdev->cmd_work); - } + rcu_read_lock(); + if (test_bit(HCI_RESET, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) + cancel_delayed_work(&hdev->cmd_timer); + else + queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, + HCI_CMD_TIMEOUT); + rcu_read_unlock(); } } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6e023b0104b039..00e02138003ece 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -895,7 +895,7 @@ void hci_request_setup(struct hci_dev *hdev) void hci_request_cancel_all(struct hci_dev *hdev) { - __hci_cmd_sync_cancel(hdev, ENODEV); + hci_cmd_sync_cancel_sync(hdev, ENODEV); cancel_interleave_scan(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b146562a65fc40..1122296ce3fa3f 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -584,7 +584,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev) mutex_unlock(&hdev->cmd_sync_work_lock); } -void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err) +void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); @@ -592,15 +592,17 @@ void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err) hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; - cancel_delayed_work_sync(&hdev->cmd_timer); - cancel_delayed_work_sync(&hdev->ncmd_timer); - atomic_set(&hdev->cmd_cnt, 1); - - wake_up_interruptible(&hdev->req_wait_q); + queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } } +EXPORT_SYMBOL(hci_cmd_sync_cancel); -void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) +/* Cancel ongoing command request synchronously: + * + * - Set result and mark status to HCI_REQ_CANCELED + * - Wakeup command sync thread + */ +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); @@ -608,10 +610,10 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; - queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); + wake_up_interruptible(&hdev->req_wait_q); } } -EXPORT_SYMBOL(hci_cmd_sync_cancel); +EXPORT_SYMBOL(hci_cmd_sync_cancel_sync); /* Submit HCI command to be run in as cmd_sync_work: * diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cabc5466401754..173986f3405f7a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1415,7 +1415,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, /* Cancel potentially blocking sync operation before power off */ if (cp->val == 0x00) { - __hci_cmd_sync_cancel(hdev, -EHOSTDOWN); + hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN); err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); } else { From eaab5c300eab901be0040aa03bf22b20cc6ce0a5 Mon Sep 17 00:00:00 2001 From: Ulrik Strid Date: Sat, 13 Jan 2024 15:27:38 +0800 Subject: [PATCH 099/707] Bluetooth: btusb: Add new VID/PID 13d3/3602 for MT7925 Add VID 13d3 & PID 3602 for MediaTek MT7925 USB Bluetooth chip. The information in /sys/kernel/debug/usb/devices about the Bluetooth device is listed as the below. T: Bus=07 Lev=01 Prnt=01 Port=10 Cnt=02 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3602 Rev= 1.00 S: Manufacturer=MediaTek Inc. S: Product=Wireless_Device S: SerialNumber=000000000 C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA A: FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01 I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=125us E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms I: If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 63 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 63 Ivl=1ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=125us E: Ad=0a(O) Atr=03(Int.) MxPS= 64 Ivl=125us I: If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us E: Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us Signed-off-by: Ulrik Strid Signed-off-by: Deren Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index a0a317bac0954f..c497d87ddf7b8b 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -658,6 +658,11 @@ static const struct usb_device_id quirks_table[] = { BTUSB_WIDEBAND_SPEECH | BTUSB_VALID_LE_STATES }, + /* Additional MediaTek MT7925 Bluetooth devices */ + { USB_DEVICE(0x13d3, 0x3602), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, + /* Additional Realtek 8723AE Bluetooth devices */ { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK }, { USB_DEVICE(0x13d3, 0x3394), .driver_info = BTUSB_REALTEK }, From 6061d66bd0e5ac1eca2858356cd8d7c12f415176 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 15 Jan 2024 21:12:19 +0100 Subject: [PATCH 100/707] Bluetooth: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_max() is inclusive. So a -1 has been added when needed. Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 9 +++++---- net/bluetooth/hci_sock.c | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index de730d210ccb45..34c8dca2069f6b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2640,10 +2640,11 @@ int hci_register_dev(struct hci_dev *hdev) */ switch (hdev->dev_type) { case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); + id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); break; case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); + id = ida_alloc_range(&hci_index_ida, 1, HCI_MAX_ID - 1, + GFP_KERNEL); break; default: return -EINVAL; @@ -2742,7 +2743,7 @@ int hci_register_dev(struct hci_dev *hdev) destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); return error; } @@ -2825,7 +2826,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->recv_event); kfree(hdev); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 3e7cd330d731ac..4ee1b976678b25 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -101,7 +101,7 @@ static bool hci_sock_gen_cookie(struct sock *sk) int id = hci_pi(sk)->cookie; if (!id) { - id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL); if (id < 0) id = 0xffffffff; @@ -119,7 +119,7 @@ static void hci_sock_free_cookie(struct sock *sk) if (id) { hci_pi(sk)->cookie = 0xffffffff; - ida_simple_remove(&sock_cookie_ida, id); + ida_free(&sock_cookie_ida, id); } } From 7f29d67809293992a721edeab3903ad498e0e9cd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 22 Dec 2023 22:37:03 -0500 Subject: [PATCH 101/707] ovl: Reject mounting case-insensitive filesystems overlayfs relies on the filesystem setting DCACHE_OP_HASH or DCACHE_OP_COMPARE to reject mounting over case-insensitive directories. Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), we set ->d_op through a hook in ->d_lookup, which means the root dentry won't have them, causing the mount to accidentally succeed. In v6.7-rc7, the following sequence will succeed to mount, but any dentry other than the root dentry will be a "weird" dentry to ovl and fail with EREMOTE. mkfs.ext4 -O casefold lower.img mount -O loop lower.img lower mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH and DCACHE_OP_COMPARE are properly set by ->lookup. Fix by explicitly rejecting superblocks that allow case-insensitive dentries. While there, re-sort the entries to have more descriptive error messages first. Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops") Signed-off-by: Gabriel Krisman Bertazi Acked-by: Amir Goldstein --- fs/overlayfs/params.c | 13 ++++++++++--- include/linux/fs.h | 9 +++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 112b4b12f8252a..a3edd9e1bdc464 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -280,12 +280,19 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, { struct ovl_fs_context *ctx = fc->fs_private; - if (ovl_dentry_weird(path->dentry)) - return invalfc(fc, "filesystem on %s not supported", name); - if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); + /* + * Root dentries of case-insensitive filesystems might not have + * the dentry operations set, but still be incompatible with + * overlayfs. Check explicitly to prevent post-mount failures. + */ + if (sb_has_encoding(path->mnt->mnt_sb)) + return invalfc(fc, "case-insensitive filesystem on %s not supported", name); + + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); /* * Check whether upper path is read-only here to report failures diff --git a/include/linux/fs.h b/include/linux/fs.h index e6ba0cc6f2eeea..a0eb8b5759a6fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +static inline bool sb_has_encoding(const struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + return !!sb->s_encoding; +#else + return false; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); From dd7adce79e7748a946f917ce15106a89132daa10 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 17 Jan 2024 13:44:05 +0200 Subject: [PATCH 102/707] hwmon: put HWMON_CHANNEL_INFO() initializers in rodata HWMON_CHANNEL_INFO() is supposed to be used as initializer for arrays of const struct hwmon_channel_info *. However, without explicit const, HWMON_CHANNEL_INFO() creates mutable compound literals, and the const pointers point at the mutable data. Add const to place the data in rodata. Cc: Jean Delvare Cc: Guenter Roeck Signed-off-by: Jani Nikula Link: https://lore.kernel.org/r/20240117114405.1506775-1-jani.nikula@intel.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 8cd6a6b335930d..c2c0da18dfa369 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -425,12 +425,12 @@ struct hwmon_channel_info { const u32 *config; }; -#define HWMON_CHANNEL_INFO(stype, ...) \ - (&(struct hwmon_channel_info) { \ - .type = hwmon_##stype, \ - .config = (u32 []) { \ - __VA_ARGS__, 0 \ - } \ +#define HWMON_CHANNEL_INFO(stype, ...) \ + (&(const struct hwmon_channel_info) { \ + .type = hwmon_##stype, \ + .config = (const u32 []) { \ + __VA_ARGS__, 0 \ + } \ }) /** From 5cade5212484e9d552a0859b9d27d661a69c79f3 Mon Sep 17 00:00:00 2001 From: Forest Crossman Date: Sat, 13 Jan 2024 16:27:55 -0800 Subject: [PATCH 103/707] hwmon: (nct6683) Add another customer ID for MSI This value was found on an MSI PRO X670-P WIFI with an NCT6687D chip. Signed-off-by: Forest Crossman Link: https://lore.kernel.org/r/CAO3ALPwot01+bBisj7Roog7SD9UwV+y4NtiakKrBDE0tPvFhbw@mail.gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/nct6683.rst | 1 + drivers/hwmon/nct6683.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Documentation/hwmon/nct6683.rst b/Documentation/hwmon/nct6683.rst index 3e7f6ee779c2f9..2a7a78eb1b4688 100644 --- a/Documentation/hwmon/nct6683.rst +++ b/Documentation/hwmon/nct6683.rst @@ -64,4 +64,5 @@ Intel DB85FL NCT6683D EC firmware version 1.0 build 04/03/13 ASRock X570 NCT6683D EC firmware version 1.0 build 06/28/19 ASRock X670E NCT6686D EC firmware version 1.0 build 05/19/22 MSI B550 NCT6687D EC firmware version 1.0 build 05/07/20 +MSI X670-P NCT6687D EC firmware version 0.0 build 09/27/22 =============== =============================================== diff --git a/drivers/hwmon/nct6683.c b/drivers/hwmon/nct6683.c index 3f3f7a88413e01..0d016fedb9c2eb 100644 --- a/drivers/hwmon/nct6683.c +++ b/drivers/hwmon/nct6683.c @@ -174,6 +174,7 @@ superio_exit(int ioreg) #define NCT6683_CUSTOMER_ID_MITAC 0xa0e #define NCT6683_CUSTOMER_ID_MSI 0x201 #define NCT6683_CUSTOMER_ID_MSI2 0x200 +#define NCT6683_CUSTOMER_ID_MSI3 0x207 #define NCT6683_CUSTOMER_ID_ASROCK 0xe2c #define NCT6683_CUSTOMER_ID_ASROCK2 0xe1b #define NCT6683_CUSTOMER_ID_ASROCK3 0x1631 @@ -1224,6 +1225,8 @@ static int nct6683_probe(struct platform_device *pdev) break; case NCT6683_CUSTOMER_ID_MSI2: break; + case NCT6683_CUSTOMER_ID_MSI3: + break; case NCT6683_CUSTOMER_ID_ASROCK: break; case NCT6683_CUSTOMER_ID_ASROCK2: From a53faa6bfa3bd670281e59d005482d24c98c4eb9 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 20 Dec 2023 14:12:13 +0100 Subject: [PATCH 104/707] dt-bindings: hwmon: ina2xx: Add label property Add a label property to allow a custom name to be used for identifying a device on the board. This is useful when multiple devices are present on the same board. Similar change was done by commit ffae65fb1ae4 ("dt-bindings: spi: spi-cadence: Add label property"). Signed-off-by: Michal Simek Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/6f3c57d08984c1978569d3918cb38eb295c0c67d.1703077926.git.michal.simek@amd.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index 378d1f6aeeb3f5..8e5c1935b5f4a7 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -32,6 +32,9 @@ properties: reg: maxItems: 1 + label: + description: A descriptive name for this device. + shunt-resistor: description: Shunt resistor value in micro-Ohm. @@ -77,6 +80,7 @@ examples: power-sensor@44 { compatible = "ti,ina220"; reg = <0x44>; + label = "vdd_3v0"; shunt-resistor = <1000>; vs-supply = <&vdd_3v0>; }; From bd975f3fa28a0573781f4244c2b314ebfd7c1015 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 20 Dec 2023 14:12:14 +0100 Subject: [PATCH 105/707] dt-bindings: hwmon: ina2xx: Describe #io-channel-cells property There are two drivers in the Linux kernel. One is hwmon based and second IIO. IIO version requires to define #io-channel-cells to operate. Signed-off-by: Michal Simek Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/aa303b9fe3116e7f98d6b72822f7f57694366db3.1703077926.git.michal.simek@amd.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index 8e5c1935b5f4a7..f324b627bf9c22 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -32,6 +32,9 @@ properties: reg: maxItems: 1 + "#io-channel-cells": + const: 1 + label: description: A descriptive name for this device. @@ -80,6 +83,7 @@ examples: power-sensor@44 { compatible = "ti,ina220"; reg = <0x44>; + #io-channel-cells = <1>; label = "vdd_3v0"; shunt-resistor = <1000>; vs-supply = <&vdd_3v0>; From cc85a2f966361054d32dd79a432b2fb6b54b3db8 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 8 Jan 2024 15:30:51 +0100 Subject: [PATCH 106/707] dt-bindings: hwmon: ina2xx: Describe ina260 chip Describe ina260 chip which is precision digital current and power monitor with precision integrated shunt resistor. Signed-off-by: Michal Simek Acked-by: Conor Dooley Link: https://lore.kernel.org/r/4c82dc4d412e91d1601c1da5bca1cdf1a91cd9b8.1704724242.git.michal.simek@amd.com Signed-off-by: Guenter Roeck --- Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index f324b627bf9c22..a099bb71415e01 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -28,6 +28,7 @@ properties: - ti,ina231 - ti,ina237 - ti,ina238 + - ti,ina260 reg: maxItems: 1 From 02db4b99fa850241a168c5238cf499109d263667 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 22 Jan 2024 13:39:41 +0800 Subject: [PATCH 107/707] firewire: Kill unnecessary buf check in device_attribute.show Per Documentation/filesystems/sysfs.rst: > sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the > method. So we can kill the unnecessary buf check safely. Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240122053942.80648-1-lizhijian@fujitsu.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 0547253d16fe5d..47d6cb3dc916d2 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -323,7 +323,7 @@ static ssize_t show_immediate(struct device *dev, if (value < 0) return -ENOENT; - return snprintf(buf, buf ? PAGE_SIZE : 0, "0x%06x\n", value); + return snprintf(buf, PAGE_SIZE, "0x%06x\n", value); } #define IMMEDIATE_ATTR(name, key) \ @@ -335,8 +335,6 @@ static ssize_t show_text_leaf(struct device *dev, struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); const u32 *directories[] = {NULL, NULL}; - size_t bufsize; - char dummy_buf[2]; int i, ret = -ENOENT; down_read(&fw_device_rwsem); @@ -358,15 +356,9 @@ static ssize_t show_text_leaf(struct device *dev, } } - if (buf) { - bufsize = PAGE_SIZE - 1; - } else { - buf = dummy_buf; - bufsize = 1; - } - for (i = 0; i < ARRAY_SIZE(directories) && !!directories[i]; ++i) { - int result = fw_csr_string(directories[i], attr->key, buf, bufsize); + int result = fw_csr_string(directories[i], attr->key, buf, + PAGE_SIZE - 1); // Detected. if (result >= 0) ret = result; From dd754748f1bef240c38f987cabd70366b7e91474 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 22 Jan 2024 13:39:42 +0800 Subject: [PATCH 108/707] firewire: Convert snprintf/sprintf to sysfs_emit Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). > drivers/firewire/core-device.c:326:8-16: WARNING: please use sysfs_emit or sysfs_emit_at No functional change intended Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240122053942.80648-2-lizhijian@fujitsu.com Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 47d6cb3dc916d2..790985479ff3b8 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -323,7 +323,7 @@ static ssize_t show_immediate(struct device *dev, if (value < 0) return -ENOENT; - return snprintf(buf, PAGE_SIZE, "0x%06x\n", value); + return sysfs_emit(buf, "0x%06x\n", value); } #define IMMEDIATE_ATTR(name, key) \ @@ -474,7 +474,7 @@ static ssize_t is_local_show(struct device *dev, { struct fw_device *device = fw_device(dev); - return sprintf(buf, "%u\n", device->is_local); + return sysfs_emit(buf, "%u\n", device->is_local); } static int units_sprintf(char *buf, const u32 *directory) From 213b755e42e2e7127777f74d2174bb4843a9b03a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20J=C3=BCcker?= Date: Fri, 22 Dec 2023 00:02:58 +0100 Subject: [PATCH 109/707] ARM: defconfig: enable STMicroelectronics accelerometer and gyro for Exynos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable STMicroelectronics accelerometer and gyro drivers for the Samsung P4note device family in exynos and multi_v7 defconfigs. Signed-off-by: Martin Jücker Link: https://lore.kernel.org/r/20231221230258.56272-2-martin.juecker@gmail.com Signed-off-by: Krzysztof Kozlowski --- arch/arm/configs/exynos_defconfig | 3 +++ arch/arm/configs/multi_v7_defconfig | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index c98d5ff8a1ed08..7ad48fdda1dac6 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -318,8 +318,11 @@ CONFIG_EXTCON_MAX77693=y CONFIG_EXTCON_MAX8997=y CONFIG_EXYNOS5422_DMC=y CONFIG_IIO=y +CONFIG_IIO_ST_ACCEL_3AXIS=m +# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set CONFIG_EXYNOS_ADC=y CONFIG_STMPE_ADC=y +CONFIG_IIO_ST_GYRO_3AXIS=m CONFIG_CM36651=y CONFIG_AK8975=y CONFIG_SENSORS_ISL29018=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index ecb3e286107a4c..0d885cb6120679 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1150,6 +1150,8 @@ CONFIG_STM32_FMC2_EBI=y CONFIG_EXYNOS5422_DMC=m CONFIG_IIO=y CONFIG_IIO_SW_TRIGGER=y +CONFIG_IIO_ST_ACCEL_3AXIS=m +# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set CONFIG_ASPEED_ADC=m CONFIG_AT91_ADC=m CONFIG_AT91_SAMA5D2_ADC=m @@ -1169,6 +1171,7 @@ CONFIG_IIO_CROS_EC_SENSORS_CORE=m CONFIG_IIO_CROS_EC_SENSORS=m CONFIG_STM32_DAC=m CONFIG_MPU3050_I2C=y +CONFIG_IIO_ST_GYRO_3AXIS=m CONFIG_CM36651=m CONFIG_IIO_CROS_EC_LIGHT_PROX=m CONFIG_SENSORS_ISL29018=y From 062bfa2943207a63315c812ebccadc4830e88d93 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 7 Jan 2024 22:19:47 +0900 Subject: [PATCH 110/707] kconfig: remove unneeded buffer allocation in zconf_initscan() In Kconfig, there is a stack to save the lexer state for each inclusion level. Currently, it operates as an empty stack, with the 'current_buf' always pointing to an empty buffer. There is no need to preallocate the buffer. Change it to a full stack. Signed-off-by: Masahiro Yamada --- scripts/kconfig/lexer.l | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l index cc386e44368346..d75423ec4eae16 100644 --- a/scripts/kconfig/lexer.l +++ b/scripts/kconfig/lexer.l @@ -391,9 +391,6 @@ void zconf_initscan(const char *name) exit(1); } - current_buf = xmalloc(sizeof(*current_buf)); - memset(current_buf, 0, sizeof(*current_buf)); - current_file = file_lookup(name); yylineno = 1; } @@ -403,9 +400,10 @@ void zconf_nextfile(const char *name) struct file *iter; struct file *file = file_lookup(name); struct buffer *buf = xmalloc(sizeof(*buf)); - memset(buf, 0, sizeof(*buf)); - current_buf->state = YY_CURRENT_BUFFER; + buf->state = YY_CURRENT_BUFFER; + buf->parent = current_buf; + current_buf = buf; yyin = zconf_fopen(file->name); if (!yyin) { fprintf(stderr, "%s:%d: can't open file \"%s\"\n", @@ -413,8 +411,6 @@ void zconf_nextfile(const char *name) exit(1); } yy_switch_to_buffer(yy_create_buffer(yyin, YY_BUF_SIZE)); - buf->parent = current_buf; - current_buf = buf; current_file->lineno = yylineno; file->parent = current_file; @@ -441,20 +437,21 @@ void zconf_nextfile(const char *name) static void zconf_endfile(void) { - struct buffer *parent; + struct buffer *tmp; current_file = current_file->parent; if (current_file) yylineno = current_file->lineno; - parent = current_buf->parent; - if (parent) { - fclose(yyin); - yy_delete_buffer(YY_CURRENT_BUFFER); - yy_switch_to_buffer(parent->state); - } - free(current_buf); - current_buf = parent; + if (!current_buf) + return; + + fclose(yyin); + yy_delete_buffer(YY_CURRENT_BUFFER); + yy_switch_to_buffer(current_buf->state); + tmp = current_buf; + current_buf = current_buf->parent; + free(tmp); } int zconf_lineno(void) From de085fa995076869340362f30828db9c683111b4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 7 Jan 2024 22:19:48 +0900 Subject: [PATCH 111/707] kconfig: fix line number in recursive inclusion detection The error message shows a wrong line number if the 'source' directive is wrapped to the following line. [Test Code] source \ "Kconfig" This results in the following error message: Recursive inclusion detected. Inclusion path: current file : Kconfig included from: Kconfig:2 The correct message should be as follows: Recursive inclusion detected. Inclusion path: current file : Kconfig included from: Kconfig:1 Signed-off-by: Masahiro Yamada --- scripts/kconfig/lexer.l | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l index d75423ec4eae16..f93b535a080c00 100644 --- a/scripts/kconfig/lexer.l +++ b/scripts/kconfig/lexer.l @@ -33,6 +33,7 @@ static int text_size, text_asize; struct buffer { struct buffer *parent; YY_BUFFER_STATE state; + int yylineno; }; static struct buffer *current_buf; @@ -402,6 +403,7 @@ void zconf_nextfile(const char *name) struct buffer *buf = xmalloc(sizeof(*buf)); buf->state = YY_CURRENT_BUFFER; + buf->yylineno = yylineno; buf->parent = current_buf; current_buf = buf; yyin = zconf_fopen(file->name); @@ -412,7 +414,7 @@ void zconf_nextfile(const char *name) } yy_switch_to_buffer(yy_create_buffer(yyin, YY_BUF_SIZE)); - current_file->lineno = yylineno; + current_file->lineno = zconf_lineno(); file->parent = current_file; for (iter = current_file; iter; iter = iter->parent) { @@ -425,7 +427,7 @@ void zconf_nextfile(const char *name) do { iter = iter->parent; fprintf(stderr, " included from: %s:%d\n", - iter->name, iter->lineno - 1); + iter->name, iter->lineno); } while (strcmp(iter->name, file->name)); exit(1); } @@ -440,8 +442,6 @@ static void zconf_endfile(void) struct buffer *tmp; current_file = current_file->parent; - if (current_file) - yylineno = current_file->lineno; if (!current_buf) return; @@ -449,6 +449,7 @@ static void zconf_endfile(void) fclose(yyin); yy_delete_buffer(YY_CURRENT_BUFFER); yy_switch_to_buffer(current_buf->state); + yylineno = current_buf->yylineno; tmp = current_buf; current_buf = current_buf->parent; free(tmp); From 7c7f8529f43eccf2e8c47d15fa65a9868f443348 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 12 Jan 2024 13:58:30 +0100 Subject: [PATCH 112/707] docs: kbuild/kconfig: reformat/cleanup This document was using headings in an odd way, causing the sidebar to be quite messy. I've adding new headings and turned some of the old headings into description lists. The indentation was a mix of spaces and tabs; I've turned them all into 4 spaces so it always reads correctly regardless of tab settings. Also use ``...`` instead of `...`; the difference is that `` is meant for "inline literals" (and renders in a monospace font) while ` is for "interpreted text" (and renders with italics). Also changed the title of the document to be more descriptive. Signed-off-by: Vegard Nossum Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Documentation/kbuild/kconfig.rst | 363 ++++++++++++++----------------- 1 file changed, 166 insertions(+), 197 deletions(-) diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst index c946eb44bd138e..fc4e845bc249ba 100644 --- a/Documentation/kbuild/kconfig.rst +++ b/Documentation/kbuild/kconfig.rst @@ -1,10 +1,10 @@ -=================== -Kconfig make config -=================== +================================= +Configuration targets and editors +================================= -This file contains some assistance for using `make *config`. +This file contains some assistance for using ``make *config``. -Use "make help" to list all of the possible configuration targets. +Use ``make help`` to list all of the possible configuration targets. The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf') programs also have embedded help text. Be sure to check that for @@ -12,8 +12,9 @@ navigation, search, and other general help text. The gconfig ('gconf') program has limited help text. + General -------- +======= New kernel releases often introduce new config symbols. Often more important, new kernel releases may rename config symbols. When @@ -24,118 +25,102 @@ symbols have been introduced. To see a list of new config symbols, use:: - cp user/some/old.config .config - make listnewconfig + cp user/some/old.config .config + make listnewconfig and the config program will list any new symbols, one per line. Alternatively, you can use the brute force method:: - make oldconfig - scripts/diffconfig .config.old .config | less - ----------------------------------------------------------------------- - -Environment variables for `*config` + make oldconfig + scripts/diffconfig .config.old .config | less -KCONFIG_CONFIG --------------- -This environment variable can be used to specify a default kernel config -file name to override the default name of ".config". -KCONFIG_DEFCONFIG_LIST ----------------------- +Environment variables +===================== -This environment variable specifies a list of config files which can be used -as a base configuration in case the .config does not exist yet. Entries in -the list are separated with whitespaces to each other, and the first one -that exists is used. +Environment variables for ``*config``: -KCONFIG_OVERWRITECONFIG ------------------------ -If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not -break symlinks when .config is a symlink to somewhere else. +``KCONFIG_CONFIG`` + This environment variable can be used to specify a default kernel config + file name to override the default name of ".config". -KCONFIG_WARN_UNKNOWN_SYMBOLS ----------------------------- -This environment variable makes Kconfig warn about all unrecognized -symbols in the config input. +``KCONFIG_DEFCONFIG_LIST`` + This environment variable specifies a list of config files which can be + used as a base configuration in case the .config does not exist yet. + Entries in the list are separated with whitespaces to each other, and + the first one that exists is used. -KCONFIG_WERROR --------------- -If set, Kconfig treats warnings as errors. +``KCONFIG_OVERWRITECONFIG`` + If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not + break symlinks when .config is a symlink to somewhere else. -`CONFIG_` ---------- -If you set `CONFIG_` in the environment, Kconfig will prefix all symbols -with its value when saving the configuration, instead of using the default, -`CONFIG_`. +``KCONFIG_WARN_UNKNOWN_SYMBOLS`` + This environment variable makes Kconfig warn about all unrecognized + symbols in the config input. ----------------------------------------------------------------------- +``KCONFIG_WERROR`` + If set, Kconfig treats warnings as errors. -Environment variables for '{allyes/allmod/allno/rand}config' +``CONFIG_`` + If you set ``CONFIG_`` in the environment, Kconfig will prefix all symbols + with its value when saving the configuration, instead of using the + default, ``CONFIG_``. -KCONFIG_ALLCONFIG ------------------ -(partially based on lkml email from/by Rob Landley, re: miniconfig) +Environment variables for ``{allyes/allmod/allno/rand}config``: --------------------------------------------------- +``KCONFIG_ALLCONFIG`` + The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also + use the environment variable KCONFIG_ALLCONFIG as a flag or a filename + that contains config symbols that the user requires to be set to a + specific value. If KCONFIG_ALLCONFIG is used without a filename where + KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", ``make *config`` + checks for a file named "all{yes/mod/no/def/random}.config" + (corresponding to the ``*config`` command that was used) for symbol values + that are to be forced. If this file is not found, it checks for a + file named "all.config" to contain forced values. -The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also -use the environment variable KCONFIG_ALLCONFIG as a flag or a filename -that contains config symbols that the user requires to be set to a -specific value. If KCONFIG_ALLCONFIG is used without a filename where -KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", `make *config` -checks for a file named "all{yes/mod/no/def/random}.config" -(corresponding to the `*config` command that was used) for symbol values -that are to be forced. If this file is not found, it checks for a -file named "all.config" to contain forced values. + This enables you to create "miniature" config (miniconfig) or custom + config files containing just the config symbols that you are interested + in. Then the kernel config system generates the full .config file, + including symbols of your miniconfig file. -This enables you to create "miniature" config (miniconfig) or custom -config files containing just the config symbols that you are interested -in. Then the kernel config system generates the full .config file, -including symbols of your miniconfig file. - -This 'KCONFIG_ALLCONFIG' file is a config file which contains -(usually a subset of all) preset config symbols. These variable -settings are still subject to normal dependency checks. - -Examples:: + This ``KCONFIG_ALLCONFIG`` file is a config file which contains + (usually a subset of all) preset config symbols. These variable + settings are still subject to normal dependency checks. - KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig + Examples:: -or:: + KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig - KCONFIG_ALLCONFIG=mini.config make allnoconfig + or:: -or:: + KCONFIG_ALLCONFIG=mini.config make allnoconfig - make KCONFIG_ALLCONFIG=mini.config allnoconfig + or:: -These examples will disable most options (allnoconfig) but enable or -disable the options that are explicitly listed in the specified -mini-config files. + make KCONFIG_ALLCONFIG=mini.config allnoconfig ----------------------------------------------------------------------- + These examples will disable most options (allnoconfig) but enable or + disable the options that are explicitly listed in the specified + mini-config files. -Environment variables for 'randconfig' +Environment variables for ``randconfig``: -KCONFIG_SEED ------------- -You can set this to the integer value used to seed the RNG, if you want -to somehow debug the behaviour of the kconfig parser/frontends. -If not set, the current time will be used. +``KCONFIG_SEED`` + You can set this to the integer value used to seed the RNG, if you want + to somehow debug the behaviour of the kconfig parser/frontends. + If not set, the current time will be used. -KCONFIG_PROBABILITY -------------------- -This variable can be used to skew the probabilities. This variable can -be unset or empty, or set to three different formats: +``KCONFIG_PROBABILITY`` + This variable can be used to skew the probabilities. This variable can + be unset or empty, or set to three different formats: ======================= ================== ===================== - KCONFIG_PROBABILITY y:n split y:m:n split + KCONFIG_PROBABILITY y:n split y:m:n split ======================= ================== ===================== - unset or empty 50 : 50 33 : 33 : 34 - N N : 100-N N/2 : N/2 : 100-N + unset or empty 50 : 50 33 : 33 : 34 + N N : 100-N N/2 : N/2 : 100-N [1] N:M N+M : 100-(N+M) N : M : 100-(N+M) [2] N:M:L N : 100-N M : L : 100-(M+L) ======================= ================== ===================== @@ -149,112 +134,98 @@ that: Examples:: - KCONFIG_PROBABILITY=10 - 10% of booleans will be set to 'y', 90% to 'n' - 5% of tristates will be set to 'y', 5% to 'm', 90% to 'n' - KCONFIG_PROBABILITY=15:25 - 40% of booleans will be set to 'y', 60% to 'n' - 15% of tristates will be set to 'y', 25% to 'm', 60% to 'n' - KCONFIG_PROBABILITY=10:15:15 - 10% of booleans will be set to 'y', 90% to 'n' - 15% of tristates will be set to 'y', 15% to 'm', 70% to 'n' + KCONFIG_PROBABILITY=10 + 10% of booleans will be set to 'y', 90% to 'n' + 5% of tristates will be set to 'y', 5% to 'm', 90% to 'n' + KCONFIG_PROBABILITY=15:25 + 40% of booleans will be set to 'y', 60% to 'n' + 15% of tristates will be set to 'y', 25% to 'm', 60% to 'n' + KCONFIG_PROBABILITY=10:15:15 + 10% of booleans will be set to 'y', 90% to 'n' + 15% of tristates will be set to 'y', 15% to 'm', 70% to 'n' ----------------------------------------------------------------------- +Environment variables for ``syncconfig``: -Environment variables for 'syncconfig' +``KCONFIG_NOSILENTUPDATE`` + If this variable has a non-blank value, it prevents silent kernel + config updates (requires explicit updates). -KCONFIG_NOSILENTUPDATE ----------------------- -If this variable has a non-blank value, it prevents silent kernel -config updates (requires explicit updates). +``KCONFIG_AUTOCONFIG`` + This environment variable can be set to specify the path & name of the + "auto.conf" file. Its default value is "include/config/auto.conf". -KCONFIG_AUTOCONFIG ------------------- -This environment variable can be set to specify the path & name of the -"auto.conf" file. Its default value is "include/config/auto.conf". +``KCONFIG_AUTOHEADER`` + This environment variable can be set to specify the path & name of the + "autoconf.h" (header) file. + Its default value is "include/generated/autoconf.h". -KCONFIG_AUTOHEADER ------------------- -This environment variable can be set to specify the path & name of the -"autoconf.h" (header) file. -Its default value is "include/generated/autoconf.h". - - ----------------------------------------------------------------------- menuconfig ----------- - -SEARCHING for CONFIG symbols +========== Searching in menuconfig: - The Search function searches for kernel configuration symbol - names, so you have to know something close to what you are - looking for. + The Search function searches for kernel configuration symbol + names, so you have to know something close to what you are + looking for. - Example:: + Example:: - /hotplug - This lists all config symbols that contain "hotplug", - e.g., HOTPLUG_CPU, MEMORY_HOTPLUG. + /hotplug + This lists all config symbols that contain "hotplug", + e.g., HOTPLUG_CPU, MEMORY_HOTPLUG. - For search help, enter / followed by TAB-TAB (to highlight - ) and Enter. This will tell you that you can also use - regular expressions (regexes) in the search string, so if you - are not interested in MEMORY_HOTPLUG, you could try:: + For search help, enter / followed by TAB-TAB (to highlight + ) and Enter. This will tell you that you can also use + regular expressions (regexes) in the search string, so if you + are not interested in MEMORY_HOTPLUG, you could try:: - /^hotplug + /^hotplug - When searching, symbols are sorted thus: + When searching, symbols are sorted thus: - - first, exact matches, sorted alphabetically (an exact match - is when the search matches the complete symbol name); - - then, other matches, sorted alphabetically. + - first, exact matches, sorted alphabetically (an exact match + is when the search matches the complete symbol name); + - then, other matches, sorted alphabetically. - For example: ^ATH.K matches: + For example, ^ATH.K matches: - ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG - [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...] + ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG + [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...] - of which only ATH5K and ATH9K match exactly and so are sorted - first (and in alphabetical order), then come all other symbols, - sorted in alphabetical order. + of which only ATH5K and ATH9K match exactly and so are sorted + first (and in alphabetical order), then come all other symbols, + sorted in alphabetical order. - In this menu, pressing the key in the (#) prefix will jump - directly to that location. You will be returned to the current - search results after exiting this new menu. + In this menu, pressing the key in the (#) prefix will jump + directly to that location. You will be returned to the current + search results after exiting this new menu. ----------------------------------------------------------------------- +User interface options for 'menuconfig': -User interface options for 'menuconfig' +``MENUCONFIG_COLOR`` + It is possible to select different color themes using the variable + MENUCONFIG_COLOR. To select a theme use:: -MENUCONFIG_COLOR ----------------- -It is possible to select different color themes using the variable -MENUCONFIG_COLOR. To select a theme use:: + make MENUCONFIG_COLOR= menuconfig - make MENUCONFIG_COLOR= menuconfig + Available themes are:: -Available themes are:: + - mono => selects colors suitable for monochrome displays + - blackbg => selects a color scheme with black background + - classic => theme with blue background. The classic look + - bluetitle => a LCD friendly version of classic. (default) - - mono => selects colors suitable for monochrome displays - - blackbg => selects a color scheme with black background - - classic => theme with blue background. The classic look - - bluetitle => a LCD friendly version of classic. (default) +``MENUCONFIG_MODE`` + This mode shows all sub-menus in one large tree. -MENUCONFIG_MODE ---------------- -This mode shows all sub-menus in one large tree. + Example:: -Example:: + make MENUCONFIG_MODE=single_menu menuconfig - make MENUCONFIG_MODE=single_menu menuconfig - ----------------------------------------------------------------------- nconfig -------- +======= nconfig is an alternate text-based configurator. It lists function keys across the bottom of the terminal (window) that execute commands. @@ -266,61 +237,59 @@ Use F1 for Global help or F3 for the Short help menu. Searching in nconfig: - You can search either in the menu entry "prompt" strings - or in the configuration symbols. + You can search either in the menu entry "prompt" strings + or in the configuration symbols. + + Use / to begin a search through the menu entries. This does + not support regular expressions. Use or for + Next hit and Previous hit, respectively. Use to + terminate the search mode. - Use / to begin a search through the menu entries. This does - not support regular expressions. Use or for - Next hit and Previous hit, respectively. Use to - terminate the search mode. + F8 (SymSearch) searches the configuration symbols for the + given string or regular expression (regex). - F8 (SymSearch) searches the configuration symbols for the - given string or regular expression (regex). + In the SymSearch, pressing the key in the (#) prefix will + jump directly to that location. You will be returned to the + current search results after exiting this new menu. - In the SymSearch, pressing the key in the (#) prefix will - jump directly to that location. You will be returned to the - current search results after exiting this new menu. +Environment variables: -NCONFIG_MODE ------------- -This mode shows all sub-menus in one large tree. +``NCONFIG_MODE`` + This mode shows all sub-menus in one large tree. -Example:: + Example:: - make NCONFIG_MODE=single_menu nconfig + make NCONFIG_MODE=single_menu nconfig ----------------------------------------------------------------------- xconfig -------- +======= Searching in xconfig: - The Search function searches for kernel configuration symbol - names, so you have to know something close to what you are - looking for. - - Example:: + The Search function searches for kernel configuration symbol + names, so you have to know something close to what you are + looking for. - Ctrl-F hotplug + Example:: - or:: + Ctrl-F hotplug - Menu: File, Search, hotplug + or:: - lists all config symbol entries that contain "hotplug" in - the symbol name. In this Search dialog, you may change the - config setting for any of the entries that are not grayed out. - You can also enter a different search string without having - to return to the main menu. + Menu: File, Search, hotplug + lists all config symbol entries that contain "hotplug" in + the symbol name. In this Search dialog, you may change the + config setting for any of the entries that are not grayed out. + You can also enter a different search string without having + to return to the main menu. ----------------------------------------------------------------------- gconfig -------- +======= Searching in gconfig: - There is no search command in gconfig. However, gconfig does - have several different viewing choices, modes, and options. + There is no search command in gconfig. However, gconfig does + have several different viewing choices, modes, and options. From f98b755f7bee20cb09814c106c1d40386ed4172e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jan 2024 19:43:36 +0900 Subject: [PATCH 113/707] kbuild: deb-pkg: show verbose log for direct package builds When the Debian package build is initiated by Kbuild ('make deb-pkg' or 'make bindeb-pkg'), the log messages are displayed in the short form, which is the Kbuild default. Otherwise, let's show verbose messages (unless the 'terse' tag is set in DEB_BUILD_OPTION), as suggested by Debian Policy: "The package build should be as verbose as reasonably possible, except where the terse tag is included in DEB_BUILD_OPTIONS." [1] This is what the Debian kernel also does. [2] [1]: https://www.debian.org/doc/debian-policy/ch-source.html#main-building-script-debian-rules [2]: https://salsa.debian.org/kernel-team/linux/-/blob/debian/6.7-1_exp1/debian/rules.real#L36 Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/rules | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 09830778006227..697fbfa7595f79 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -11,6 +11,14 @@ ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))) MAKEFLAGS += -j$(NUMJOBS) endif +# When KBUILD_VERBOSE is undefined (presumably you are directly working with +# the debianized tree), show verbose logs unless DEB_BUILD_OPTION=terse is set. +ifeq ($(origin KBUILD_VERBOSE),undefined) + ifeq (,$(filter terse,$(DEB_BUILD_OPTIONS))) + export KBUILD_VERBOSE := 1 + endif +endif + revision = $(lastword $(subst -, ,$(shell dpkg-parsechangelog -S Version))) CROSS_COMPILE ?= $(filter-out $(DEB_BUILD_GNU_TYPE)-, $(DEB_HOST_GNU_TYPE)-) make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(revision) $(addprefix CROSS_COMPILE=,$(CROSS_COMPILE)) From 057b438ff43bc31546ef297855fb7c64ad0e81e9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jan 2024 19:43:37 +0900 Subject: [PATCH 114/707] kbuild: deb-pkg: make debian/rules quiet for 'make deb-pkg' Add $(Q) to the commands in debian/rules to make them quiet when the package built is initiated by 'make deb-pkg' or when the 'terse' tag is set to DEB_BUILD_OPTIONS. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/rules | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 697fbfa7595f79..a183e95886e61d 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -16,6 +16,8 @@ endif ifeq ($(origin KBUILD_VERBOSE),undefined) ifeq (,$(filter terse,$(DEB_BUILD_OPTIONS))) export KBUILD_VERBOSE := 1 + else + Q := @ endif endif @@ -27,20 +29,20 @@ make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(r binary: binary-arch binary-indep binary-indep: build-indep binary-arch: build-arch - $(MAKE) $(make-opts) \ + $(Q)$(MAKE) $(make-opts) \ run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb' .PHONY: build build-indep build-arch build: build-arch build-indep build-indep: build-arch: - $(MAKE) $(make-opts) olddefconfig - $(MAKE) $(make-opts) $(if $(filter um,$(ARCH)),,headers) all + $(Q)$(MAKE) $(make-opts) olddefconfig + $(Q)$(MAKE) $(make-opts) $(if $(filter um,$(ARCH)),,headers) all .PHONY: clean clean: - rm -rf debian/files debian/linux-* debian/deb-env.vars* - $(MAKE) ARCH=$(ARCH) clean + $(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* + $(Q)$(MAKE) ARCH=$(ARCH) clean # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed # directly. Run 'dpkg-architecture --print-set --print-format=make' to @@ -49,6 +51,6 @@ ifndef DEB_HOST_ARCH include debian/deb-env.vars debian/deb-env.vars: - dpkg-architecture -a$$(cat debian/arch) --print-set --print-format=make > $@.tmp - mv $@.tmp $@ + $(Q)dpkg-architecture -a$$(cat debian/arch) --print-set --print-format=make > $@.tmp + $(Q)mv $@.tmp $@ endif From 2c5251ac1ff06696881ff021e6d1b8a3e63d90e5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jan 2024 19:43:38 +0900 Subject: [PATCH 115/707] kbuild: deb-pkg: build binary-arch in parallel 'make deb-pkg' builds build-arch in parallel, but binary-arch serially. Given that all binary packages are independent of one another, they can be built in parallel. I am uncertain whether debian/files is robust against a race condition. Just in case, make dh_gencontrol (dpkg-gencontrol) output to separate debian/*.files, which are then concatenated into debian/files. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 42 +++++++++++------------------------- scripts/package/debian/rules | 39 +++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index bf96a3c2460814..d31b16afe0db8c 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -24,18 +24,6 @@ if_enabled_echo() { fi } -create_package() { - export DH_OPTIONS="-p${1}" - - dh_installdocs - dh_installchangelogs - dh_compress - dh_fixperms - dh_gencontrol - dh_md5sums - dh_builddeb -- ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS} -} - install_linux_image () { pname=$1 pdir=debian/$1 @@ -161,21 +149,15 @@ install_libc_headers () { mv "$pdir/usr/include/asm" "$pdir/usr/include/${DEB_HOST_MULTIARCH}" } -rm -f debian/files - -packages_enabled=$(dh_listpackages) - -for package in ${packages_enabled} -do - case ${package} in - *-dbg) - install_linux_image_dbg "${package}";; - linux-image-*|user-mode-linux-*) - install_linux_image "${package}";; - linux-libc-dev) - install_libc_headers "${package}";; - linux-headers-*) - install_kernel_headers "${package}";; - esac - create_package "${package}" -done +package=$1 + +case "${package}" in +*-dbg) + install_linux_image_dbg "${package}";; +linux-image-*|user-mode-linux-*) + install_linux_image "${package}";; +linux-libc-dev) + install_libc_headers "${package}";; +linux-headers-*) + install_kernel_headers "${package}";; +esac diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index a183e95886e61d..57f1cf7c6b32f1 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -25,12 +25,43 @@ revision = $(lastword $(subst -, ,$(shell dpkg-parsechangelog -S Version))) CROSS_COMPILE ?= $(filter-out $(DEB_BUILD_GNU_TYPE)-, $(DEB_HOST_GNU_TYPE)-) make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(revision) $(addprefix CROSS_COMPILE=,$(CROSS_COMPILE)) +binary-targets := $(addprefix binary-, image image-dbg headers libc-dev) + +all-packages = $(shell dh_listpackages) +image-package = $(filter linux-image-% user-%, $(filter-out %-dbg, $(all-packages))) +image-dbg-package = $(filter %-dbg, $(all-packages)) +libc-dev-package = $(filter linux-libc-dev, $(all-packages)) +headers-package = $(filter linux-headers-%, $(all-packages)) + +mk-files = $(patsubst binary-%,debian/%.files,$1) +package = $($(@:binary-%=%-package)) + +# DH_OPTION is an environment variable common for all debhelper commands. +# We could 'export' it, but here it is passed from the command line to clarify +# which package is being processed in the build log. +DH_OPTIONS = -p$(package) + +define binary + $(Q)+$(MAKE) $(make-opts) run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb $(package)' + $(Q)dh_installdocs $(DH_OPTIONS) + $(Q)dh_installchangelogs $(DH_OPTIONS) + $(Q)dh_compress $(DH_OPTIONS) + $(Q)dh_fixperms $(DH_OPTIONS) + $(Q)dh_gencontrol $(DH_OPTIONS) -- -f$(call mk-files,$@) + $(Q)dh_md5sums $(DH_OPTIONS) + $(Q)dh_builddeb $(DH_OPTIONS) -- $(addprefix -Z,$(KDEB_COMPRESS)) +endef + +.PHONY: $(binary-targets) +$(binary-targets): build-arch + $(Q)truncate -s0 $(call mk-files,$@) + $(if $(package),$(binary)) + .PHONY: binary binary-indep binary-arch binary: binary-arch binary-indep binary-indep: build-indep -binary-arch: build-arch - $(Q)$(MAKE) $(make-opts) \ - run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb' +binary-arch: $(binary-targets) + $(Q)cat $(call mk-files,$^) > debian/files .PHONY: build build-indep build-arch build: build-arch build-indep @@ -41,7 +72,7 @@ build-arch: .PHONY: clean clean: - $(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* + $(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* debian/*.files $(Q)$(MAKE) ARCH=$(ARCH) clean # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed From bd768db42ef6d27c3eca1d29ec8beb4474e4ba35 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jan 2024 19:43:39 +0900 Subject: [PATCH 116/707] kbuild: deb-pkg: call more misc debhelper commands Use dh_prep instead of removing old build directories manually. Use dh_clean instead of removing build directories and debian/files manually. Call dh_testdir and dh_testroot for preliminary checks. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 8 -------- scripts/package/debian/rules | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index d31b16afe0db8c..e797ad360f7a5b 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -28,8 +28,6 @@ install_linux_image () { pname=$1 pdir=debian/$1 - rm -rf ${pdir} - # Only some architectures with OF support have this target if is_enabled CONFIG_OF_EARLY_FLATTREE && [ -d "${srctree}/arch/${SRCARCH}/boot/dts" ]; then ${MAKE} -f ${srctree}/Makefile INSTALL_DTBS_PATH="${pdir}/usr/lib/linux-image-${KERNELRELEASE}" dtbs_install @@ -97,8 +95,6 @@ install_linux_image () { install_linux_image_dbg () { pdir=debian/$1 - rm -rf ${pdir} - # Parse modules.order directly because 'make modules_install' may sign, # compress modules, and then run unneeded depmod. while read -r mod; do @@ -128,8 +124,6 @@ install_kernel_headers () { pdir=debian/$1 version=${1#linux-headers-} - rm -rf $pdir - "${srctree}/scripts/package/install-extmod-build" "${pdir}/usr/src/linux-headers-${version}" mkdir -p $pdir/lib/modules/$version/ @@ -139,8 +133,6 @@ install_kernel_headers () { install_libc_headers () { pdir=debian/$1 - rm -rf $pdir - $MAKE -f $srctree/Makefile headers_install INSTALL_HDR_PATH=$pdir/usr # move asm headers to /usr/include//asm to match the structure diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 57f1cf7c6b32f1..ca07243bd5cdf6 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -42,6 +42,9 @@ package = $($(@:binary-%=%-package)) DH_OPTIONS = -p$(package) define binary + $(Q)dh_testdir $(DH_OPTIONS) + $(Q)dh_testroot $(DH_OPTIONS) + $(Q)dh_prep $(DH_OPTIONS) $(Q)+$(MAKE) $(make-opts) run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb $(package)' $(Q)dh_installdocs $(DH_OPTIONS) $(Q)dh_installchangelogs $(DH_OPTIONS) @@ -72,7 +75,8 @@ build-arch: .PHONY: clean clean: - $(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* debian/*.files + $(Q)dh_clean + $(Q)rm -rf debian/deb-env.vars* debian/*.files $(Q)$(MAKE) ARCH=$(ARCH) clean # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed From df38347f1934b29d4b3075a53a6404b0d58dcb2f Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 5 Jan 2024 11:14:07 +0100 Subject: [PATCH 117/707] x86/sev: Harden #VC instruction emulation somewhat Compare the opcode bytes at rIP for each #VC exit reason to verify the instruction which raised the #VC exception is actually the right one. Signed-off-by: Borislav Petkov (AMD) Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20240105101407.11694-1-bp@alien8.de --- arch/x86/boot/compressed/sev.c | 4 ++ arch/x86/kernel/sev-shared.c | 102 ++++++++++++++++++++++++++++++++- arch/x86/kernel/sev.c | 5 +- 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 454acd7a2dafff..073291832f44d2 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -304,6 +304,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) if (result != ES_OK) goto finish; + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + switch (exit_code) { case SVM_EXIT_RDTSC: case SVM_EXIT_RDTSCP: diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 1d24ec6799157b..5db24d0fc557cd 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -10,11 +10,15 @@ */ #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) #else #undef WARN #define WARN(condition, format...) (!!(condition)) +#define sev_printk(fmt, ...) +#define sev_printk_rtl(fmt, ...) #endif /* I/O parameters for CPUID-related helpers */ @@ -574,6 +578,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; struct cpuid_leaf leaf; int ret; @@ -581,6 +586,10 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (exit_code != SVM_EXIT_CPUID) goto fail; + /* Is it really a CPUID insn? */ + if (opcode != 0xa20f) + goto fail; + leaf.fn = fn; leaf.subfn = subfn; @@ -1170,3 +1179,92 @@ static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) out: return ret; } + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + if (opcode == 0x010f && modrm == 0xc8) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + if (opcode == 0x010f && modrm == 0xc9) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index c67285824e8267..1ec753331524ab 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1752,7 +1752,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) { - enum es_result result; + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; switch (exit_code) { case SVM_EXIT_READ_DR7: From 8b4e2d8976b6c93b3786ced79f1c55d5d7b38737 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 3 Jan 2024 22:10:05 +0100 Subject: [PATCH 118/707] pmdomain: core: Scale down parent/child performance states in reverse order Power domains might have parent domains assigned that are automatically managed by the PM domain core. In particular, parent domains are automatically powered on/off and setting performance states on child domains are propagated to parent domains (e.g. using an OPP table from the device tree). Currently the parent performance state is always adjusted before the performance state of the child domain, which is a problem for some cases when scaling down the performance state. More exactly, it may lead to that the parent domain could run in a lower performance state, than what is required by the child domain. To fix the behaviour, let's differentiate between scaling up/down and adjust the order of operations: - When scaling up, parent domains should be adjusted before the child domain. In case of an error, the rollback happens in reverse order. - When scaling down, parent domains should be adjusted after the child domain, in reverse order, just as if we would rollback scaling up. In case of an error, the rollback happens in normal order (just as if we would normally scale up). Signed-off-by: Stephan Gerhold Link: https://lore.kernel.org/r/20240103-genpd-perf-order-v2-1-eeecfc55624b@gerhold.net Tested-by: Ulf Hansson Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 124 +++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index a1f6cba3ae6c86..fec9dc6ab82863 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -311,72 +311,102 @@ static int genpd_xlate_performance_state(struct generic_pm_domain *genpd, } static int _genpd_set_performance_state(struct generic_pm_domain *genpd, - unsigned int state, int depth) + unsigned int state, int depth); + +static void _genpd_rollback_parent_state(struct gpd_link *link, int depth) { - struct generic_pm_domain *parent; - struct gpd_link *link; - int parent_state, ret; + struct generic_pm_domain *parent = link->parent; + int parent_state; - if (state == genpd->performance_state) - return 0; + genpd_lock_nested(parent, depth + 1); - /* Propagate to parents of genpd */ - list_for_each_entry(link, &genpd->child_links, child_node) { - parent = link->parent; + parent_state = link->prev_performance_state; + link->performance_state = parent_state; - /* Find parent's performance state */ - ret = genpd_xlate_performance_state(genpd, parent, state); - if (unlikely(ret < 0)) - goto err; + parent_state = _genpd_reeval_performance_state(parent, parent_state); + if (_genpd_set_performance_state(parent, parent_state, depth + 1)) { + pr_err("%s: Failed to roll back to %d performance state\n", + parent->name, parent_state); + } - parent_state = ret; + genpd_unlock(parent); +} - genpd_lock_nested(parent, depth + 1); +static int _genpd_set_parent_state(struct generic_pm_domain *genpd, + struct gpd_link *link, + unsigned int state, int depth) +{ + struct generic_pm_domain *parent = link->parent; + int parent_state, ret; - link->prev_performance_state = link->performance_state; - link->performance_state = parent_state; - parent_state = _genpd_reeval_performance_state(parent, - parent_state); - ret = _genpd_set_performance_state(parent, parent_state, depth + 1); - if (ret) - link->performance_state = link->prev_performance_state; + /* Find parent's performance state */ + ret = genpd_xlate_performance_state(genpd, parent, state); + if (unlikely(ret < 0)) + return ret; - genpd_unlock(parent); + parent_state = ret; - if (ret) - goto err; - } + genpd_lock_nested(parent, depth + 1); - if (genpd->set_performance_state) { - ret = genpd->set_performance_state(genpd, state); - if (ret) - goto err; - } + link->prev_performance_state = link->performance_state; + link->performance_state = parent_state; - genpd->performance_state = state; - return 0; + parent_state = _genpd_reeval_performance_state(parent, parent_state); + ret = _genpd_set_performance_state(parent, parent_state, depth + 1); + if (ret) + link->performance_state = link->prev_performance_state; -err: - /* Encountered an error, lets rollback */ - list_for_each_entry_continue_reverse(link, &genpd->child_links, - child_node) { - parent = link->parent; + genpd_unlock(parent); - genpd_lock_nested(parent, depth + 1); + return ret; +} + +static int _genpd_set_performance_state(struct generic_pm_domain *genpd, + unsigned int state, int depth) +{ + struct gpd_link *link = NULL; + int ret; + + if (state == genpd->performance_state) + return 0; - parent_state = link->prev_performance_state; - link->performance_state = parent_state; + /* When scaling up, propagate to parents first in normal order */ + if (state > genpd->performance_state) { + list_for_each_entry(link, &genpd->child_links, child_node) { + ret = _genpd_set_parent_state(genpd, link, state, depth); + if (ret) + goto rollback_parents_up; + } + } - parent_state = _genpd_reeval_performance_state(parent, - parent_state); - if (_genpd_set_performance_state(parent, parent_state, depth + 1)) { - pr_err("%s: Failed to roll back to %d performance state\n", - parent->name, parent_state); + if (genpd->set_performance_state) { + ret = genpd->set_performance_state(genpd, state); + if (ret) { + if (link) + goto rollback_parents_up; + return ret; } + } - genpd_unlock(parent); + /* When scaling down, propagate to parents last in reverse order */ + if (state < genpd->performance_state) { + list_for_each_entry_reverse(link, &genpd->child_links, child_node) { + ret = _genpd_set_parent_state(genpd, link, state, depth); + if (ret) + goto rollback_parents_down; + } } + genpd->performance_state = state; + return 0; + +rollback_parents_up: + list_for_each_entry_continue_reverse(link, &genpd->child_links, child_node) + _genpd_rollback_parent_state(link, depth); + return ret; +rollback_parents_down: + list_for_each_entry_continue(link, &genpd->child_links, child_node) + _genpd_rollback_parent_state(link, depth); return ret; } From f55fcdb06f529c5031ada7edd5ede63dfdcc4a54 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 9 Jan 2024 15:29:27 +0800 Subject: [PATCH 119/707] fs: fix a typo in attr.c The word "filesytem" should be "filesystem" Signed-off-by: Jay Link: https://lore.kernel.org/r/20240109072927.29626-1-merqqcury@gmail.com Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/attr.c b/fs/attr.c index 5a13f0c8495fde..49d23b5dbab4b9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode, EXPORT_SYMBOL(may_setattr); /** - * notify_change - modify attributes of a filesytem object + * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes From a121e297aac51c3ddb3f1f9aea58d4289aea8bc8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 8 Jan 2024 18:20:40 +0100 Subject: [PATCH 120/707] fs: Wrong function name in comment This comment refers to function mark_buffer_inode_dirty(), but the function is actually called mark_buffer_dirty_inode(), so fix the comment. Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20240108172040.178173-1-agruenba@redhat.com Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5a5..dcafee512089a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * From 30c45816e23523cd4527a8d20bf20c7d6bfd5a16 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 11 Jan 2024 17:22:40 +1100 Subject: [PATCH 121/707] initramfs: remove duplicate built-in __initramfs_start unpacking If initrd_start cpio extraction fails, CONFIG_BLK_DEV_RAM triggers fallback to initrd.image handling via populate_initrd_image(). The populate_initrd_image() call follows successful extraction of any built-in cpio archive at __initramfs_start, but currently performs built-in archive extraction a second time. Prior to commit b2a74d5f9d446 ("initramfs: remove clean_rootfs"), the second built-in initramfs unpack call was used to repopulate entries removed by clean_rootfs(), but it's no longer necessary now the contents of the previous extraction are retained. Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20240111062240.9362-1-ddiss@suse.de Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- init/initramfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb16..d3c623dde01a88 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -679,8 +679,6 @@ static void __init populate_initrd_image(char *err) struct file *file; loff_t pos = 0; - unpack_to_rootfs(__initramfs_start, __initramfs_size); - printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); From ce2128e96b51c78732010fc79763d7a7141259e2 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 10 Jan 2024 23:47:40 +0800 Subject: [PATCH 122/707] eventfd: add a BUILD_BUG_ON() to ensure consistency between EFD_SEMAPHORE and the uapi introduce a BUILD_BUG_ON to check that the EFD_SEMAPHORE is equal to its definition in the uapi file, just like EFD_CLOEXEC and EFD_NONBLOCK. Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_0BAA2DEAF9208D49987457E6583F9BE79507@qq.com Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Jan Kara Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba760..0252b71099fbca 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -383,6 +383,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; From 4148bf4c5e6dc0932e3d4649047b203e9d554d3c Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 16 Jan 2024 17:11:37 +0800 Subject: [PATCH 123/707] buffer: Use KMEM_CACHE instead of kmem_cache_create() Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240116091137.92375-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index dcafee512089a2..b55dea034a5d83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3121,12 +3121,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ From c2f1af4e033e17f9332e47e8fc3266afd95ad548 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 16 Jan 2024 15:53:35 +0800 Subject: [PATCH 124/707] fs: improve dump_mapping() robustness We met a kernel crash issue when running stress-ng testing, and the system crashes when printing the dentry name in dump_mapping(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : dentry_name+0xd8/0x224 lr : pointer+0x22c/0x370 sp : ffff800025f134c0 ...... Call trace: dentry_name+0xd8/0x224 pointer+0x22c/0x370 vsnprintf+0x1ec/0x730 vscnprintf+0x2c/0x60 vprintk_store+0x70/0x234 vprintk_emit+0xe0/0x24c vprintk_default+0x3c/0x44 vprintk_func+0x84/0x2d0 printk+0x64/0x88 __dump_page+0x52c/0x530 dump_page+0x14/0x20 set_migratetype_isolate+0x110/0x224 start_isolate_page_range+0xc4/0x20c offline_pages+0x124/0x474 memory_block_offline+0x44/0xf4 memory_subsys_offline+0x3c/0x70 device_offline+0xf0/0x120 ...... The root cause is that, one thread is doing page migration, and we will use the target page's ->mapping field to save 'anon_vma' pointer between page unmap and page move, and now the target page is locked and refcount is 1. Currently, there is another stress-ng thread performing memory hotplug, attempting to offline the target page that is being migrated. It discovers that the refcount of this target page is 1, preventing the offline operation, thus proceeding to dump the page. However, page_mapping() of the target page may return an incorrect file mapping to crash the system in dump_mapping(), since the target page->mapping only saves 'anon_vma' pointer without setting PAGE_MAPPING_ANON flag. The page migration issue has been fixed by commit d1adb25df711 ("mm: migrate: fix getting incorrect page mapping during page migration"). In addition, Matthew suggested we should also improve dump_mapping()'s robustness to resilient against the kernel crash [1]. With checking the 'dentry.parent' and 'dentry.d_name.name' used by dentry_name(), I can see dump_mapping() will output the invalid dentry instead of crashing the system when this issue is reproduced again. [12211.189128] page:fffff7de047741c0 refcount:1 mapcount:0 mapping:ffff989117f55ea0 index:0x1 pfn:0x211dd07 [12211.189144] aops:0x0 ino:1 invalid dentry:74786574206e6870 [12211.189148] flags: 0x57ffffc0000001(locked|node=1|zone=2|lastcpupid=0x1fffff) [12211.189150] page_type: 0xffffffff() [12211.189153] raw: 0057ffffc0000001 0000000000000000 dead000000000122 ffff989117f55ea0 [12211.189154] raw: 0000000000000001 0000000000000001 00000001ffffffff 0000000000000000 [12211.189155] page dumped because: unmovable page [1] https://lore.kernel.org/all/ZXxn%2F0oixJxxAnpF@casper.infradead.org/ Suggested-by: Matthew Wilcox Signed-off-by: Baolin Wang Link: https://lore.kernel.org/r/937ab1f87328516821d39be672b6bc18861d9d3e.1705391420.git.baolin.wang@linux.alibaba.com Signed-off-by: Christian Brauner --- fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e7d..6d0d5423036380 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -588,7 +588,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; From 352f0ba021364519f95aa378dcca18cf7600fba1 Mon Sep 17 00:00:00 2001 From: Hu Yadi Date: Fri, 12 Jan 2024 15:40:59 +0800 Subject: [PATCH 125/707] selftests/filesystems:fix build error in overlayfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One build issue comes up due to both mount.h included dev_in_maps.c In file included from dev_in_maps.c:10: /usr/include/sys/mount.h:35:3: error: expected identifier before numeric constant 35 | MS_RDONLY = 1, /* Mount read-only. */ | ^~~~~~~~~ In file included from dev_in_maps.c:13: Remove one of them to solve conflict, another error comes up: dev_in_maps.c:170:6: error: implicit declaration of function ‘mount’ [-Werror=implicit-function-declaration] 170 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { | ^~~~~ cc1: all warnings being treated as errors and then , add sys_mount definition to solve it After both above, dev_in_maps.c can be built correctly on my mache(gcc 10.2,glibc-2.32,kernel-5.10) Signed-off-by: Hu Yadi Link: https://lore.kernel.org/r/20240112074059.29673-1-hu.yadi@h3c.com Acked-by: Andrei Vagin Signed-off-by: Christian Brauner --- .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index e19ab0e8570913..759f86e7d263e4 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,11 @@ static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) { return syscall(__NR_fsmount, fd, flags, attr_flags); } - +static int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return syscall(__NR_mount, src, tgt, fst, flags, data); +} static int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, unsigned int flags) @@ -166,8 +169,7 @@ int main(int argc, char **argv) ksft_test_result_skip("unable to create a new mount namespace\n"); return 1; } - - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { pr_perror("mount"); return 1; } From b872e2a5ac3456dd1a3229657e1ea46aff06d2f1 Mon Sep 17 00:00:00 2001 From: "Hu.Yadi" Date: Thu, 11 Jan 2024 19:32:29 +0800 Subject: [PATCH 126/707] selftests/move_mount_set_group:Make tests build with old libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace SYS_ with __NR_. Using the __NR_ notation, provided by UAPI, is useful to build tests on systems without the SYS_ definitions. Replace SYS_move_mount with __NR_move_mount Similar changes: commit 87129ef13603 ("selftests/landlock: Make tests build with old libc") Acked-by: Mickaël Salaün Signed-off-by: Hu.Yadi Link: https://lore.kernel.org/r/20240111113229.10820-1-hu.yadi@h3c.com Reviewed-by: Berlin Suggested-by: Jiao Signed-off-by: Christian Brauner --- .../move_mount_set_group/move_mount_set_group_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c index 50ed5d475dd131..bcf51d785a3712 100644 --- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -218,7 +218,7 @@ static bool move_mount_set_group_supported(void) if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0)) return -1; - ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM, + ret = syscall(__NR_move_mount, AT_FDCWD, SET_GROUP_FROM, AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP); umount2("/tmp", MNT_DETACH); @@ -363,7 +363,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying) CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); ASSERT_EQ(wait_for_pid(pid), 0); - ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "", + ASSERT_EQ(syscall(__NR_move_mount, ca_from.mntfd, "", ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); From 74ad68a64b60029c9e5dc78f63ae3fffc9a5569b Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: [PATCH 127/707] vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a7049512..4f7cfda29143e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e1a..2203d3194b91a7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) From 172827cc44e92f3416392a43e4d219cbcf4d012c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:33:39 +0800 Subject: [PATCH 128/707] writeback: move wb_wakeup_delayed defination to fs-writeback.c The wb_wakeup_delayed is only used in fs-writeback.c. Move it to fs-writeback.c after defination of wb_wakeup and make it static. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev.h | 1 - mm/backing-dev.c | 25 ------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d84fcc471c600..e4f17c53ddfcf3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). + */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1a97277f99b1b8..8e7af9a03b41dd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); -void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb14d..039dc74b505a85 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -372,31 +372,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). - */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), From 19e062e48b33f9f52dbb6f87def79b709f407260 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 19 Jan 2024 07:39:06 -0800 Subject: [PATCH 129/707] do_sys_name_to_handle(): use kzalloc() to fix kernel-infoleak syzbot identified a kernel information leak vulnerability in do_sys_name_to_handle() and issued the following report [1]. [1] "BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_user+0xbc/0x100 lib/usercopy.c:40 instrument_copy_to_user include/linux/instrumented.h:114 [inline] _copy_to_user+0xbc/0x100 lib/usercopy.c:40 copy_to_user include/linux/uaccess.h:191 [inline] do_sys_name_to_handle fs/fhandle.c:73 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x949/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] __kmem_cache_alloc_node+0x5c9/0x970 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc+0x121/0x3c0 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] do_sys_name_to_handle fs/fhandle.c:39 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x441/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Bytes 18-19 of 20 are uninitialized Memory access of size 20 starts at ffff888128a46380 Data copied to user address 0000000020000240" Per Chuck Lever's suggestion, use kzalloc() instead of kmalloc() to solve the problem. Fixes: 990d6c2d7aee ("vfs: Add name to file handle conversion support") Suggested-by: Chuck Lever III Reported-and-tested-by: Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240119153906.4367-1-n.zhandarovich@fintech.ru Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ead7..57a12614addfd4 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; From aafef05b6399a2c4ce1c457d7fadf19443cdda2c Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 27 Dec 2023 02:20:32 +0100 Subject: [PATCH 130/707] pmdomain: qcom: rpmpd: Keep one RPM handle for all RPMPDs For no apparent reason (as there's just one RPM per SoC), all RPMPDs currently store a copy of a pointer to smd_rpm. Introduce a single, global one to save up on space in each definition. bloat-o-meter reports: Total: Before=92010, After=91062, chg -1.03% Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231227-topic-rpmpd_cleanup-v1-1-860ab141b076@linaro.org Signed-off-by: Ulf Hansson --- drivers/pmdomain/qcom/rpmpd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pmdomain/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c index 7796d65f96e8cf..90b62767f9d028 100644 --- a/drivers/pmdomain/qcom/rpmpd.c +++ b/drivers/pmdomain/qcom/rpmpd.c @@ -16,6 +16,8 @@ #define domain_to_rpmpd(domain) container_of(domain, struct rpmpd, pd) +static struct qcom_smd_rpm *rpmpd_smd_rpm; + /* Resource types: * RPMPD_X is X encoded as a little-endian, lower-case, ASCII string */ #define RPMPD_SMPA 0x61706d73 @@ -54,7 +56,6 @@ struct rpmpd { bool enabled; const int res_type; const int res_id; - struct qcom_smd_rpm *rpm; unsigned int max_state; __le32 key; bool state_synced; @@ -879,7 +880,7 @@ static int rpmpd_send_enable(struct rpmpd *pd, bool enable) .value = cpu_to_le32(enable), }; - return qcom_rpm_smd_write(pd->rpm, QCOM_SMD_RPM_ACTIVE_STATE, + return qcom_rpm_smd_write(rpmpd_smd_rpm, QCOM_SMD_RPM_ACTIVE_STATE, pd->res_type, pd->res_id, &req, sizeof(req)); } @@ -891,7 +892,7 @@ static int rpmpd_send_corner(struct rpmpd *pd, int state, unsigned int corner) .value = cpu_to_le32(corner), }; - return qcom_rpm_smd_write(pd->rpm, state, pd->res_type, pd->res_id, + return qcom_rpm_smd_write(rpmpd_smd_rpm, state, pd->res_type, pd->res_id, &req, sizeof(req)); }; @@ -1004,12 +1005,11 @@ static int rpmpd_probe(struct platform_device *pdev) int i; size_t num; struct genpd_onecell_data *data; - struct qcom_smd_rpm *rpm; struct rpmpd **rpmpds; const struct rpmpd_desc *desc; - rpm = dev_get_drvdata(pdev->dev.parent); - if (!rpm) { + rpmpd_smd_rpm = dev_get_drvdata(pdev->dev.parent); + if (!rpmpd_smd_rpm) { dev_err(&pdev->dev, "Unable to retrieve handle to RPM\n"); return -ENODEV; } @@ -1039,7 +1039,6 @@ static int rpmpd_probe(struct platform_device *pdev) continue; } - rpmpds[i]->rpm = rpm; rpmpds[i]->max_state = desc->max_state; rpmpds[i]->pd.power_off = rpmpd_power_off; rpmpds[i]->pd.power_on = rpmpd_power_on; From 2afe7095791a48696940c8921c7b67c442882317 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 27 Dec 2023 16:18:54 +0100 Subject: [PATCH 131/707] pmdomain: core: Print a message when unused power domains are disabled In a similar spirit to commit 12ca59b91d04 ("clk: Print an info line before disabling unused clocks"), print the message in both ignore AND cleanup cases to better inform the user (and more importantly, the developer) when it happens. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231227-topic-pmdomain_spam-v1-1-ff0410086b36@linaro.org Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index fec9dc6ab82863..30b9be61bab18d 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -1130,6 +1130,7 @@ static int __init genpd_power_off_unused(void) return 0; } + pr_info("genpd: Disabling unused power domains\n"); mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) From d901fa88145178db7bfedfbdc13924b564c94d50 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 14:44:41 +0100 Subject: [PATCH 132/707] pmdomain: renesas: rcar-gen4-sysc: Remove unneeded includes The R-Car V3U System Controller (SYSC) driver no longer needs these includes since the factoring out of the common R-Car Gen4 SYSC driver in commit e62906d6315f652b ("soc: renesas: rcar-gen4-sysc: Introduce R-Car Gen4 SYSC driver"). The R-Car S4-8 and V4H SYSC drivers never needed these includes, as these drivers always used the common R-Car Gen4 SYSC driver. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/5b440f84ab8b52499ab307c84154dcbc0f41d1d7.1705931035.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/r8a779a0-sysc.c | 12 ------------ drivers/pmdomain/renesas/r8a779f0-sysc.c | 12 ------------ drivers/pmdomain/renesas/r8a779g0-sysc.c | 12 ------------ 3 files changed, 36 deletions(-) diff --git a/drivers/pmdomain/renesas/r8a779a0-sysc.c b/drivers/pmdomain/renesas/r8a779a0-sysc.c index 04f1bc322ae7b6..54cdf250f7c2d1 100644 --- a/drivers/pmdomain/renesas/r8a779a0-sysc.c +++ b/drivers/pmdomain/renesas/r8a779a0-sysc.c @@ -5,19 +5,7 @@ * Copyright (C) 2020 Renesas Electronics Corp. */ -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include diff --git a/drivers/pmdomain/renesas/r8a779f0-sysc.c b/drivers/pmdomain/renesas/r8a779f0-sysc.c index 5602aa6bd7ed15..6ed13cd1cb249d 100644 --- a/drivers/pmdomain/renesas/r8a779f0-sysc.c +++ b/drivers/pmdomain/renesas/r8a779f0-sysc.c @@ -5,19 +5,7 @@ * Copyright (C) 2021 Renesas Electronics Corp. */ -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include diff --git a/drivers/pmdomain/renesas/r8a779g0-sysc.c b/drivers/pmdomain/renesas/r8a779g0-sysc.c index b932eba1b8042d..249cf43af45b64 100644 --- a/drivers/pmdomain/renesas/r8a779g0-sysc.c +++ b/drivers/pmdomain/renesas/r8a779g0-sysc.c @@ -5,19 +5,7 @@ * Copyright (C) 2022 Renesas Electronics Corp. */ -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include From 05170c8598f1edccc5948fe797a40125d55c4f65 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 18 Jan 2024 13:42:57 +0800 Subject: [PATCH 133/707] pmdomain: ti: Add a null pointer check to the omap_prm_domain_init devm_kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240118054257.200814-1-chentao@kylinos.cn Signed-off-by: Ulf Hansson --- drivers/pmdomain/ti/omap_prm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pmdomain/ti/omap_prm.c b/drivers/pmdomain/ti/omap_prm.c index c2feae3a634caf..b8ceb3c2b81c25 100644 --- a/drivers/pmdomain/ti/omap_prm.c +++ b/drivers/pmdomain/ti/omap_prm.c @@ -695,6 +695,8 @@ static int omap_prm_domain_init(struct device *dev, struct omap_prm *prm) data = prm->data; name = devm_kasprintf(dev, GFP_KERNEL, "prm_%s", data->name); + if (!name) + return -ENOMEM; prmd->dev = dev; prmd->prm = prm; From 768c04ccb9013c0eab51dd31f63474ef8a7456db Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 19 Jan 2024 02:47:41 +0100 Subject: [PATCH 134/707] pmdomain: imx8mp-blk-ctrl: Error out if domains are missing in DT This driver assumes that domain->power_dev is non-NULL in its suspend/resume path. The assumption is valid, since all the devices that are being looked up here should be described in DT. In case they are not described in DT, because the DT is faulty, suspend/resume attempt would trigger NULL pointer dereference. To avoid this failure, check whether the power_dev assignment is not NULL right away in probe callback and fail early if it is. Signed-off-by: Marek Vasut Reviewed-by: Peng Fan Link: https://lore.kernel.org/r/20240119014807.268694-1-marex@denx.de Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/imx8m-blk-ctrl.c | 9 ++++++--- drivers/pmdomain/imx/imx8mp-blk-ctrl.c | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/pmdomain/imx/imx8m-blk-ctrl.c b/drivers/pmdomain/imx/imx8m-blk-ctrl.c index 1341a707f61bcb..ca942d7929c2ba 100644 --- a/drivers/pmdomain/imx/imx8m-blk-ctrl.c +++ b/drivers/pmdomain/imx/imx8m-blk-ctrl.c @@ -258,11 +258,14 @@ static int imx8m_blk_ctrl_probe(struct platform_device *pdev) domain->power_dev = dev_pm_domain_attach_by_name(dev, data->gpc_name); - if (IS_ERR(domain->power_dev)) { - dev_err_probe(dev, PTR_ERR(domain->power_dev), + if (IS_ERR_OR_NULL(domain->power_dev)) { + if (!domain->power_dev) + ret = -ENODEV; + else + ret = PTR_ERR(domain->power_dev); + dev_err_probe(dev, ret, "failed to attach power domain \"%s\"\n", data->gpc_name); - ret = PTR_ERR(domain->power_dev); goto cleanup_pds; } diff --git a/drivers/pmdomain/imx/imx8mp-blk-ctrl.c b/drivers/pmdomain/imx/imx8mp-blk-ctrl.c index e3203eb6a02293..e488cf79b80070 100644 --- a/drivers/pmdomain/imx/imx8mp-blk-ctrl.c +++ b/drivers/pmdomain/imx/imx8mp-blk-ctrl.c @@ -687,11 +687,14 @@ static int imx8mp_blk_ctrl_probe(struct platform_device *pdev) domain->power_dev = dev_pm_domain_attach_by_name(dev, data->gpc_name); - if (IS_ERR(domain->power_dev)) { - dev_err_probe(dev, PTR_ERR(domain->power_dev), + if (IS_ERR_OR_NULL(domain->power_dev)) { + if (!domain->power_dev) + ret = -ENODEV; + else + ret = PTR_ERR(domain->power_dev); + dev_err_probe(dev, ret, "failed to attach power domain %s\n", data->gpc_name); - ret = PTR_ERR(domain->power_dev); goto cleanup_pds; } From 460fad3643ba3a2b4332f3f21c149ad35749d87c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 15 Jan 2024 13:18:20 +0100 Subject: [PATCH 135/707] gfs2: Fix gfs2_drevalidate NULL pointer dereference Commit dd00aaeb3432 added an RCU-safe way of computing d_inode(parent) to gfs2_drevalidate() to support the LOOKUP_RCU flag, but then failed to convert one of the instances of d_inode(parent) to its RCU-safe replacement. This manifested as a NULL pointer dereference. Fix that. Reported-by: Al Viro Fixes: dd00aaeb3432 ("gfs2: Use GL_NOBLOCK flag for non-blocking lookups") Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dentry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 177f1f41f22545..c6483fb9862450 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -72,7 +72,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto out; } - error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + error = gfs2_dir_check(dinode, &dentry->d_name, ip); valid = inode ? !error : (error == -ENOENT); if (!had_lock) From 04b945e4cf81a12365f8207a4d34dbc81ba17413 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 21 Dec 2023 07:16:04 -0800 Subject: [PATCH 136/707] slimbus: qcom-ngd-ctrl: Make QMI message rules const Commit ff6d365898d4 ("soc: qcom: qmi: use const for struct qmi_elem_info") allows QMI message encoding/decoding rules to be const, so do that for qcom-ngd-ctrl.c. Signed-off-by: Jeff Johnson Signed-off-by: Srinivas Kandagatla --- drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c index 77aa6d26476cd2..efeba8275a6691 100644 --- a/drivers/slimbus/qcom-ngd-ctrl.c +++ b/drivers/slimbus/qcom-ngd-ctrl.c @@ -220,7 +220,7 @@ struct slimbus_power_resp_msg_v01 { struct qmi_response_type_v01 resp; }; -static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { { .data_type = QMI_UNSIGNED_4_BYTE, .elem_len = 1, @@ -262,7 +262,7 @@ static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, @@ -284,7 +284,7 @@ static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { { .data_type = QMI_UNSIGNED_4_BYTE, .elem_len = 1, @@ -324,7 +324,7 @@ static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = { }, }; -static struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = { +static const struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = { { .data_type = QMI_STRUCT, .elem_len = 1, From 53ddef135d3a80064b74964a08b0e0f3aed7c952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= Date: Mon, 22 Jan 2024 17:59:55 +0100 Subject: [PATCH 137/707] Bluetooth: mgmt: Fix limited discoverable off timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LIMITED_DISCOVERABLE flag is not reset from Class of Device and advertisement on limited discoverable timeout. This prevents to pass PTS test GAP/DISC/LIMM/BV-02-C Calling set_discoverable_sync as when the limited discovery is set correctly update the Class of Device and advertisement. Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 173986f3405f7a..8c4493255f92ab 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1045,6 +1045,8 @@ static void rpa_expired(struct work_struct *work) hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL); } +static int set_discoverable_sync(struct hci_dev *hdev, void *data); + static void discov_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -1063,7 +1065,7 @@ static void discov_off(struct work_struct *work) hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hdev->discov_timeout = 0; - hci_update_discoverable(hdev); + hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL); mgmt_new_settings(hdev); From 819faf7a243ef2771ba650dfc1b8a40f5a3d94ab Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 15 Jan 2024 22:10:44 +0100 Subject: [PATCH 138/707] gfs2: Pass FGP flags to gfs2_getbuf Replace gfs2_getbuf()'s create argument with a fgp_flags argument. Use the FGP_CREAT flag instead of the CREATE flag to indicate that new buffers should be created. In addition, when the FGP_NOWAIT flag is set and gfs2_getbuf() would sleep, -EAGAIN is returned instead. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/bmap.c | 2 +- fs/gfs2/dir.c | 2 +- fs/gfs2/meta_io.c | 36 +++++++++++++++++++++++------------- fs/gfs2/meta_io.h | 2 +- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f11f..92945e5b764348 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -301,7 +301,7 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) if (!*t) continue; - rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), FGP_CREAT); if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 560e4624c09f2d..518a7fb42df044 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1500,7 +1500,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, if (blocknr == last) continue; - bh = gfs2_getbuf(gl, blocknr, 1); + bh = gfs2_getbuf(gl, blocknr, FGP_CREAT); if (trylock_buffer(bh)) { if (buffer_uptodate(bh)) { unlock_buffer(bh); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f814054c8cd08a..785d9ef6f24a21 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -106,12 +106,15 @@ const struct address_space_operations gfs2_rgrp_aops = { * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock * @blkno: the block number (filesystem scope) - * @create: 1 if the buffer should be created + * @fgp_flags: Flags like FGP_CREAT and FGP_NOWAIT + * + * Returns ERR_PTR(-EAGAIN) if the FGP_NOWAIT flag is set and the function + * would sleep. * * Returns: the buffer */ - -struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + fgf_t fgp_flags) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; @@ -128,17 +131,24 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ - if (create) { + if (fgp_flags & FGP_CREAT) { folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + FGP_LOCK | FGP_ACCESSED | fgp_flags, mapping_gfp_mask(mapping) | __GFP_NOFAIL); + if (IS_ERR(folio)) + return ERR_CAST(folio); bh = folio_buffers(folio); - if (!bh) + if (!bh) { + if (fgp_flags & FGP_NOWAIT) { + bh = ERR_PTR(-EAGAIN); + goto out_unlock; + } bh = create_empty_buffers(folio, sdp->sd_sb.sb_bsize, 0); + } } else { folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED, 0); + FGP_LOCK | FGP_ACCESSED | fgp_flags, 0); if (IS_ERR(folio)) return NULL; bh = folio_buffers(folio); @@ -181,7 +191,7 @@ static void meta_prep_new(struct buffer_head *bh) struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) { struct buffer_head *bh; - bh = gfs2_getbuf(gl, blkno, CREATE); + bh = gfs2_getbuf(gl, blkno, FGP_CREAT); meta_prep_new(bh); return bh; } @@ -258,7 +268,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, return -EIO; } - *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + *bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT); lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -271,7 +281,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } if (rahead) { - bh = gfs2_getbuf(gl, blkno + 1, CREATE); + bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT); lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -443,7 +453,7 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { ty = REMOVE_META; - bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + bh = gfs2_getbuf(ip->i_gl, bstart, 0); if (!bh && gfs2_is_jdata(ip)) { bh = gfs2_getjdatabuf(ip, bstart); ty = REMOVE_JDATA; @@ -519,7 +529,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (extlen > max_ra) extlen = max_ra; - first_bh = gfs2_getbuf(gl, dblock, CREATE); + first_bh = gfs2_getbuf(gl, dblock, FGP_CREAT); if (buffer_uptodate(first_bh)) goto out; @@ -529,7 +539,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) extlen--; while (extlen) { - bh = gfs2_getbuf(gl, dblock, CREATE); + bh = gfs2_getbuf(gl, dblock, FGP_CREAT); bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO); brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 831d988c2ceb74..e239e410881c1b 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -55,7 +55,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp); int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, - int create); + fgf_t fgp_flags); enum { REMOVE_JDATA = 0, REMOVE_META = 1, From 80e9596fe900336e83ef06f6eb79e214a358ee5d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 00:09:44 +0100 Subject: [PATCH 139/707] gfs2: Split gfs2_meta_read_async off from gfs2_meta_read Split gfs2_meta_read_async() off from gfs2_meta_read() and implement gfs2_meta_read() in terms of gfs2_meta_read_async() and gfs2_meta_wait(). The initial check for filesystem withdrawal in gfs2_meta_wait() is unnecessary and can be removed because gfs2_meta_wait() always follows gfs2_meta_read_async() in the code and gfs2_meta_read_async() already performs that check. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dir.c | 6 +++--- fs/gfs2/incore.h | 1 - fs/gfs2/meta_io.c | 46 ++++++++++++++++++++++------------------------ fs/gfs2/meta_io.h | 6 ++++-- fs/gfs2/quota.c | 2 +- fs/gfs2/rgrp.c | 3 ++- fs/gfs2/xattr.c | 13 ++++++------- 7 files changed, 38 insertions(+), 39 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 518a7fb42df044..446e374a8d78ca 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -105,7 +105,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); + error = gfs2_meta_read(ip->i_gl, block, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -300,7 +300,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, 0, &bh); if (error) goto fail; } @@ -757,7 +757,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 95a334d64da2a3..e1343fd0a5b108 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -22,7 +22,6 @@ #include #include -#define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 struct gfs2_log_operations; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 785d9ef6f24a21..faa2bd7771060a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -245,18 +245,17 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) } /** - * gfs2_meta_read - Read a block from disk + * gfs2_meta_read_async - Read a block from disk * @gl: The glock covering the block * @blkno: The block number - * @flags: flags * @rahead: Do read-ahead * @bhp: the place where the buffer is returned (NULL on failure) * * Returns: errno */ -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - int rahead, struct buffer_head **bhp) +int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, + int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct buffer_head *bh, *bhs[2]; @@ -273,7 +272,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - flags &= ~DIO_WAIT; } else { bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -294,20 +292,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num); - if (!(flags & DIO_WAIT)) - return 0; - - bh = *bhp; - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) { - struct gfs2_trans *tr = current->journal_info; - if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) - gfs2_io_error_bh_wd(sdp, bh); - brelse(bh); - *bhp = NULL; - return -EIO; - } - return 0; } @@ -321,10 +305,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (gfs2_withdrawing_or_withdrawn(sdp) && - !gfs2_withdraw_in_prog(sdp)) - return -EIO; - wait_on_buffer(bh); if (!buffer_uptodate(bh)) { @@ -340,6 +320,24 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, + int rahead, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + int ret; + + ret = gfs2_meta_read_async(gl, blkno, rahead, bhp); + if (ret) + return ret; + + ret = gfs2_meta_wait(sdp, *bhp); + if (ret) { + brelse(*bhp); + *bhp = NULL; + } + return ret; +} + void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { struct address_space *mapping = bh->b_folio->mapping; @@ -496,7 +494,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, if (num == ip->i_no_addr) rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); + ret = gfs2_meta_read(gl, num, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index e239e410881c1b..f04c91eadfb4ae 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -51,9 +51,11 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) } struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - int rahead, struct buffer_head **bhp); +int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, + int rahead, struct buffer_head **bhp); int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, + int rahead, struct buffer_head **bhp); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags); enum { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index aa9cf010284897..a68af8bdf7de44 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -424,7 +424,7 @@ static int bh_get(struct gfs2_quota_data *qd) goto fail; error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, - DIO_WAIT, 0, &bh); + 0, &bh); if (error) goto fail; error = -EIO; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 26d6c1eea55919..fcef82c767e346 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1210,7 +1210,8 @@ int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); + error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0, + &bi->bi_bh); if (error) goto fail; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8c96ba6230d1b9..759347ec67af5c 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -128,7 +128,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &bh); if (error) return error; @@ -152,7 +152,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -468,8 +468,8 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return -ENOMEM; for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, - bh + x); + error = gfs2_meta_read_async(ip->i_gl, be64_to_cpu(*dataptrs), + 0, bh + x); if (error) { while (x--) brelse(bh[x]); @@ -976,8 +976,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, - &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh); if (error) return error; @@ -1279,7 +1278,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh); if (error) return error; From 224e701e74ec6cd44f6f80352bbfef03e406720b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 14:22:21 +0100 Subject: [PATCH 140/707] gfs2: Add FGP_NOWAIT support to gfs2_meta_read_async Add an fgp_flags argument to gfs2_meta_read_async() and gfs2_meta_read() that is passed through to gfs2_getbuf(). When the FGP_NOWAIT flag is set and gfs2_meta_read_async() would sleep, -EAGAIN is returned instead. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dir.c | 6 +++--- fs/gfs2/meta_io.c | 41 ++++++++++++++++++++++++++++++++++------- fs/gfs2/meta_io.h | 4 ++-- fs/gfs2/quota.c | 2 +- fs/gfs2/rgrp.c | 2 +- fs/gfs2/xattr.c | 10 +++++----- 6 files changed, 46 insertions(+), 19 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 446e374a8d78ca..a1f47696a2ecfb 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -105,7 +105,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, 0, &bh); + error = gfs2_meta_read(ip->i_gl, block, 0, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -300,7 +300,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, 0, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, 0, 0, &bh); if (error) goto fail; } @@ -757,7 +757,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, 0, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, 0, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index faa2bd7771060a..0827e1f58a5a03 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -248,13 +248,16 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) * gfs2_meta_read_async - Read a block from disk * @gl: The glock covering the block * @blkno: The block number + * @fgp_flags: FGP_NOWAIT if sleeping is prohibited * @rahead: Do read-ahead * @bhp: the place where the buffer is returned (NULL on failure) * + * Returns -EAGAIN if the FGP_NOWAIT flag is set and the function would sleep. + * * Returns: errno */ -int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, +int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; @@ -267,7 +270,28 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, return -EIO; } - *bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT); + *bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT | fgp_flags); + if (IS_ERR(bh)) { + *bhp = NULL; + return PTR_ERR(bh); + } + + if (fgp_flags & FGP_NOWAIT) { + bool uptodate = false; + + /* skips readahead entirely. */ + + if (trylock_buffer(bh)) { + uptodate = buffer_uptodate(bh); + unlock_buffer(bh); + } + if (!uptodate) { + brelse(bh); + *bhp = NULL; + return -EAGAIN; + } + return 0; + } lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -279,7 +303,9 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, } if (rahead) { - bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT); + bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT | fgp_flags); + if (IS_ERR(bh)) + goto out; lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -291,6 +317,7 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, } } +out: gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num); return 0; } @@ -320,14 +347,14 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int ret; - ret = gfs2_meta_read_async(gl, blkno, rahead, bhp); - if (ret) + ret = gfs2_meta_read_async(gl, blkno, fgp_flags, rahead, bhp); + if (ret || (fgp_flags & FGP_NOWAIT)) return ret; ret = gfs2_meta_wait(sdp, *bhp); @@ -494,7 +521,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, if (num == ip->i_no_addr) rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, rahead, &bh); + ret = gfs2_meta_read(gl, num, 0, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index f04c91eadfb4ae..6ca37e9f1c955e 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -51,10 +51,10 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) } struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, +int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags, int rahead, struct buffer_head **bhp); int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags, int rahead, struct buffer_head **bhp); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a68af8bdf7de44..8b47306816ab74 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -424,7 +424,7 @@ static int bh_get(struct gfs2_quota_data *qd) goto fail; error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, - 0, &bh); + 0, 0, &bh); if (error) goto fail; error = -EIO; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fcef82c767e346..9fa4fc7f4bcfe9 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1210,7 +1210,7 @@ int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0, + error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); if (error) goto fail; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 759347ec67af5c..745c7cf7851918 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -128,7 +128,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &bh); if (error) return error; @@ -152,7 +152,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, 0, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, 0, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -469,7 +469,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, for (x = 0; x < nptrs; x++) { error = gfs2_meta_read_async(ip->i_gl, be64_to_cpu(*dataptrs), - 0, bh + x); + 0, 0, bh + x); if (error) { while (x--) brelse(bh[x]); @@ -976,7 +976,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &indbh); if (error) return error; @@ -1278,7 +1278,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &indbh); if (error) return error; From add350d5501e99ef7229d08e47f2790d578ead1d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 15:00:16 +0100 Subject: [PATCH 141/707] gfs2: Pass FGP flags to gfs2_meta_{,inode_}buffer Pass a fgp_flags argument to gfs2_meta_buffer() and gfs2_meta_inode_buffer(). If the FGP_NOWAIT flag is set and one of these functions would sleep, -EAGAIN is returned instead. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/aops.c | 4 ++-- fs/gfs2/bmap.c | 19 ++++++++++--------- fs/gfs2/dir.c | 20 ++++++++++---------- fs/gfs2/file.c | 4 ++-- fs/gfs2/glops.c | 2 +- fs/gfs2/inode.c | 4 ++-- fs/gfs2/meta_io.c | 7 +++++-- fs/gfs2/meta_io.h | 7 ++++--- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/recovery.c | 2 +- fs/gfs2/rgrp.c | 2 +- fs/gfs2/super.c | 6 +++--- fs/gfs2/xattr.c | 4 ++-- 13 files changed, 44 insertions(+), 39 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 974aca9c8ea84f..31da7f6926c966 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -424,7 +424,7 @@ static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio) if (unlikely(folio->index)) { dsize = 0; } else { - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) goto out; from = dibh->b_data + sizeof(struct gfs2_dinode); @@ -548,7 +548,7 @@ void adjust_fs_space(struct inode *inode) /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); - if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + if (gfs2_meta_inode_buffer(m_ip, 0, &m_bh) != 0) goto out; spin_lock(&sdp->sd_statfs_spin); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 92945e5b764348..ffe19d3012f5f0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -95,7 +95,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) int isdir = gfs2_is_dir(ip); int error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) return error; @@ -331,7 +331,8 @@ static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, if (!dblock) break; - ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]); + ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, 0, + &mp->mp_bh[x + 1]); if (ret) return ret; } @@ -858,7 +859,7 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, down_read(&ip->i_rw_mutex); - ret = gfs2_meta_inode_buffer(ip, &dibh); + ret = gfs2_meta_inode_buffer(ip, 0, &dibh); if (ret) goto unlock; mp->mp_bh[0] = dibh; @@ -1377,7 +1378,7 @@ static int trunc_start(struct inode *inode, u64 newsize) if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) goto out; @@ -1580,7 +1581,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, if (current->journal_info) { struct buffer_head *dibh; - ret = gfs2_meta_inode_buffer(ip, &dibh); + ret = gfs2_meta_inode_buffer(ip, 0, &dibh); if (ret) goto out; @@ -1785,7 +1786,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) } start_aligned = mp_h; - ret = gfs2_meta_inode_buffer(ip, &dibh); + ret = gfs2_meta_inode_buffer(ip, 0, &dibh); if (ret) return ret; @@ -1985,7 +1986,7 @@ static int trunc_end(struct gfs2_inode *ip) down_write(&ip->i_rw_mutex); - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) goto out; @@ -2091,7 +2092,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; } - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) goto do_end_trans; @@ -2345,7 +2346,7 @@ static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length) if (offset + length > inode->i_size) length = inode->i_size - offset; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) return error; gfs2_trans_add_meta(ip->i_gl, dibh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index a1f47696a2ecfb..8719c8316e2801 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -122,7 +122,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, struct buffer_head *dibh; int error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) return error; @@ -221,7 +221,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, } out: - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) return error; @@ -246,7 +246,7 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, struct buffer_head *dibh; int error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (!error) { memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); @@ -844,7 +844,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, } - error = gfs2_meta_inode_buffer(ip, &bh); + error = gfs2_meta_inode_buffer(ip, 0, &bh); if (error) return ERR_PTR(error); dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); @@ -915,7 +915,7 @@ static int dir_make_exhash(struct inode *inode) u64 bn; int error; - error = gfs2_meta_inode_buffer(dip, &dibh); + error = gfs2_meta_inode_buffer(dip, 0, &dibh); if (error) return error; @@ -1115,7 +1115,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) oleaf->lf_depth = nleaf->lf_depth; - error = gfs2_meta_inode_buffer(dip, &dibh); + error = gfs2_meta_inode_buffer(dip, 0, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_add_inode_blocks(&dip->i_inode, 1); @@ -1169,7 +1169,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) return -ENOMEM; h = hc2; - error = gfs2_meta_inode_buffer(dip, &dibh); + error = gfs2_meta_inode_buffer(dip, 0, &dibh); if (error) goto out_kfree; @@ -1586,7 +1586,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, return -EIO; } - error = gfs2_meta_inode_buffer(dip, &dibh); + error = gfs2_meta_inode_buffer(dip, 0, &dibh); if (error) return error; @@ -1758,7 +1758,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(bh); brelse(obh); - error = gfs2_meta_inode_buffer(ip, &bh); + error = gfs2_meta_inode_buffer(ip, 0, &bh); if (error) return error; gfs2_trans_add_meta(ip->i_gl, bh); @@ -2063,7 +2063,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, goto out_end_trans; } - error = gfs2_meta_inode_buffer(dip, &dibh); + error = gfs2_meta_inode_buffer(dip, 0, &dibh); if (error) goto out_end_trans; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 992ca4effb505e..ac4ed6abfc6c24 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -257,7 +257,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto out; - error = gfs2_meta_inode_buffer(ip, &bh); + error = gfs2_meta_inode_buffer(ip, 0, &bh); if (error) goto out_trans_end; inode_set_ctime_current(inode); @@ -1179,7 +1179,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct buffer_head *dibh; int error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (unlikely(error)) return error; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 45653cbc8a87d1..cc38aa81d2467d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -483,7 +483,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) struct buffer_head *dibh; int error; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) return error; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6bfc9383b7b8ec..a417650e378f96 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1027,7 +1027,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_ipres; } - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) goto out_end_trans; @@ -1833,7 +1833,7 @@ static const char *gfs2_get_link(struct dentry *dentry, goto out; } - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error) { buf = ERR_PTR(error); goto out; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0827e1f58a5a03..c3c04898971f9c 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -504,13 +504,16 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) * @ip: The GFS2 inode * @mtype: The block type (GFS2_METATYPE_*) * @num: The block number (device relative) of the buffer + * @fgp_flags: FGP_NOWAIT if sleeping is prohibited * @bhp: the buffer is returned here * + * Returns -EAGAIN if the FGP_NOWAIT flag is set and the function would sleep. + * * Returns: errno */ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, - struct buffer_head **bhp) + fgf_t fgp_flags, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_glock *gl = ip->i_gl; @@ -521,7 +524,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, if (num == ip->i_no_addr) rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, 0, rahead, &bh); + ret = gfs2_meta_read(gl, num, fgp_flags, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 6ca37e9f1c955e..7b51220781138e 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -66,12 +66,13 @@ enum { void gfs2_remove_from_journal(struct buffer_head *bh, int meta); void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, - struct buffer_head **bhp); + fgf_t fgp_flags, struct buffer_head **bhp); -static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, fgf_t fgp_flags, struct buffer_head **bhp) { - return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, bhp); + return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, fgp_flags, + bhp); } struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1281e60be63900..1a01adb48f5bbb 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -698,7 +698,7 @@ static int init_statfs(struct gfs2_sbd *sdp) goto free_local; } /* read in the local statfs buffer - other nodes don't change it. */ - error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh); + error = gfs2_meta_inode_buffer(ip, 0, &sdp->sd_sc_bh); if (error) { fs_err(sdp, "Cannot read in local statfs: %d\n", error); goto unlock_sd_gh; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f4fe7039f725b0..4db3ca9c3b0270 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -317,7 +317,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd, BUG_ON(!inode); ip = GFS2_I(inode); - error = gfs2_meta_inode_buffer(ip, &bh); + error = gfs2_meta_inode_buffer(ip, 0, &bh); if (error) goto out; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9fa4fc7f4bcfe9..c9aec3dba80c6d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2452,7 +2452,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; if (!dinode) { ip->i_goal = block + *nblocks - 1; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5f79466340d2a..7bfdd5924a5457 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -192,7 +192,7 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp) if (error) return error; - error = gfs2_meta_inode_buffer(m_ip, &m_bh); + error = gfs2_meta_inode_buffer(m_ip, 0, &m_bh); if (error) goto out; @@ -282,7 +282,7 @@ int gfs2_statfs_sync(struct super_block *sb, int type) if (error) goto out; - error = gfs2_meta_inode_buffer(m_ip, &m_bh); + error = gfs2_meta_inode_buffer(m_ip, 0, &m_bh); if (error) goto out_unlock; @@ -521,7 +521,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) need_endtrans = 1; } - ret = gfs2_meta_inode_buffer(ip, &bh); + ret = gfs2_meta_inode_buffer(ip, 0, &bh); if (ret == 0) { gfs2_trans_add_meta(ip->i_gl, bh); gfs2_dinode_out(ip, bh->b_data); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 745c7cf7851918..e2ff0fc8e4459a 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1360,7 +1360,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (!error) { gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -1412,7 +1412,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) gfs2_add_inode_blocks(&ip->i_inode, -1); if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_meta_inode_buffer(ip, 0, &dibh); if (!error) { gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); From 2afca0b0afc4a300fc5556c6d75c6e641493342f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 21:44:31 +0100 Subject: [PATCH 142/707] gfs2: Pass FGP flags to gfs2_dirent_search Add an fgp_flags argument to gfs2_dirent_search(). When the FGP_NOWAIT flag is set and gfs2_dirent_search() would sleep, -EAGAIN is returned instead. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dir.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8719c8316e2801..81d1ea61f44f3e 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -805,6 +805,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, const struct qstr *name, gfs2_dscan_t scan, + fgf_t fgp_flags, struct buffer_head **pbh) { struct buffer_head *bh; @@ -817,6 +818,11 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, unsigned int hsize = BIT(ip->i_depth); unsigned int index; u64 ln; + + /* no lockless lookup inside ExHash directories */ + if (fgp_flags & FGP_NOWAIT) + return ERR_PTR(-EAGAIN); + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); @@ -843,8 +849,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, return error ? ERR_PTR(error) : NULL; } - - error = gfs2_meta_inode_buffer(ip, 0, &bh); + error = gfs2_meta_inode_buffer(ip, fgp_flags, &bh); if (error) return ERR_PTR(error); dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); @@ -1647,7 +1652,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, u64 addr, formal_ino; u16 dtype; - dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh); if (dent) { struct inode *inode; u16 rahead; @@ -1677,7 +1682,7 @@ int gfs2_dir_check(struct inode *dir, const struct qstr *name, struct gfs2_dirent *dent; int ret = -ENOENT; - dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh); if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); @@ -1805,7 +1810,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, while(1) { if (da->bh == NULL) { dent = gfs2_dirent_search(inode, name, - gfs2_dirent_find_space, &bh); + gfs2_dirent_find_space, 0, + &bh); } if (dent) { if (IS_ERR(dent)) @@ -1880,7 +1886,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ - dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, 0, + &bh); if (!dent) { gfs2_consist_inode(dip); return -EIO; @@ -1939,7 +1946,8 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, struct buffer_head *bh; struct gfs2_dirent *dent; - dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, 0, + &bh); if (!dent) { gfs2_consist_inode(dip); return -EIO; @@ -2166,7 +2174,7 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, da->bh = NULL; da->dent = NULL; - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 0, &bh); if (!dent) { da->nr_blocks = sdp->sd_max_dirres; if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && From 77c59cfc8ac1d696cd48e8ac438f65a117399bd4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 23:10:08 +0100 Subject: [PATCH 143/707] gfs2: Pass FGP flags to gfs2_dir_check Pass a fgp_flags argument to gfs2_dir_check(). If the FGP_NOWAIT flag is set and gfs2_dir_check() would sleep, -EAGAIN is returned instead. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dentry.c | 2 +- fs/gfs2/dir.c | 4 ++-- fs/gfs2/dir.h | 2 +- fs/gfs2/inode.c | 6 +++--- fs/gfs2/ops_fstype.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index c6483fb9862450..f179a6d94cbefe 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -72,7 +72,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto out; } - error = gfs2_dir_check(dinode, &dentry->d_name, ip); + error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0); valid = inode ? !error : (error == -ENOENT); if (!had_lock) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 81d1ea61f44f3e..d3a9c9094db085 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1676,13 +1676,13 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, } int gfs2_dir_check(struct inode *dir, const struct qstr *name, - const struct gfs2_inode *ip) + const struct gfs2_inode *ip, fgf_t fgp_flags) { struct buffer_head *bh; struct gfs2_dirent *dent; int ret = -ENOENT; - dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh); + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, fgp_flags, &bh); if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 25a857c78b538b..cfec869a11c8cc 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -27,7 +27,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, bool fail_on_exist); int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, fgf_t fgp_flags); int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip, struct gfs2_diradd *da); static inline void gfs2_dir_no_add(struct gfs2_diradd *da) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a417650e378f96..948f5c203a3860 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -980,7 +980,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock; - error = gfs2_dir_check(dir, &dentry->d_name, NULL); + error = gfs2_dir_check(dir, &dentry->d_name, NULL, 0); switch (error) { case -ENOENT: break; @@ -1096,7 +1096,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - return gfs2_dir_check(&dip->i_inode, name, ip); + return gfs2_dir_check(&dip->i_inode, name, ip, 0); } /** @@ -1522,7 +1522,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; - error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL, 0); switch (error) { case -ENOENT: error = 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1a01adb48f5bbb..f6c80ff618cc5d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -584,7 +584,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) name.len = sprintf(buf, "journal%u", sdp->sd_journals); name.hash = gfs2_disk_hash(name.name, name.len); - error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL, 0); if (error == -ENOENT) { error = 0; break; From 949eda30775ddda2704ad72c999e6d4112957fb3 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 17 Jan 2024 13:49:19 +0100 Subject: [PATCH 144/707] gfs2: Minor gfs2_drevalidate cleanup Get rid of the had_lock and valid variables in gfs2_drevalidate(). Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dentry.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index f179a6d94cbefe..99239d80982b58 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -38,8 +38,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) struct inode *dinode, *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; - int error, valid = 0; - int had_lock = 0; + int error; + + gfs2_holder_mark_uninitialized(&d_gh); if (flags & LOOKUP_RCU) { dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); @@ -54,18 +55,19 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); if (inode) { - if (is_bad_inode(inode)) + if (is_bad_inode(inode)) { + error = 0; goto out; + } ip = GFS2_I(inode); } if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) { - valid = 1; + error = 1; goto out; } - had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); - if (!had_lock) { + if (!gfs2_glock_is_locked_by_me(dip->i_gl)) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh); if (error) @@ -73,13 +75,13 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) } error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0); - valid = inode ? !error : (error == -ENOENT); + error = inode ? !error : (error == -ENOENT); - if (!had_lock) + if (gfs2_holder_initialized(&d_gh)) gfs2_glock_dq_uninit(&d_gh); out: dput(parent); - return valid; + return error; } static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) From acd2d246f4b26460d0499bc4e0042f63380e526b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 16 Jan 2024 23:15:18 +0100 Subject: [PATCH 145/707] gfs2: Fix LOOKUP_RCU support in gfs2_drevalidate Fix non-sleeping lookups in gfs2_drevalidate() by passing the FGP_NOWAIT flag down to gfs2_dir_check(). This will cause gfs2_dir_check() to return -EAGAIN when it would otherwise sleep. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price --- fs/gfs2/dentry.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 99239d80982b58..b14956cdab0ec2 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -74,12 +74,20 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto out; } - error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0); + if (flags & LOOKUP_RCU) { + error = gfs2_dir_check(dinode, &dentry->d_name, ip, FGP_NOWAIT); + if (error == -EAGAIN) { + error = -ECHILD; + goto out; + } + } else { + error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0); + } error = inode ? !error : (error == -ENOENT); +out: if (gfs2_holder_initialized(&d_gh)) gfs2_glock_dq_uninit(&d_gh); -out: dput(parent); return error; } From 189a4edb774b9ee5daf345e703e53fffaf509285 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 22 Jan 2024 16:49:34 -0800 Subject: [PATCH 146/707] lkdtm: Make lkdtm_do_action() return to avoid tail call optimization The comments for lkdtm_do_action() explicitly call out that it shouldn't be inlined because we want it to show up in stack crawls. However, at least with some compilers / options it's still vanishing due to tail call optimization. Let's add a return value to the function to make it harder for the compiler to do tail call optimization here. Now that we have a return value, we can actually use it in the callers, which is a minor improvement in the code. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20240122164935.1.I345e485f36babad76370c59659a706723750d950@changeid Signed-off-by: Kees Cook --- drivers/misc/lkdtm/core.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 0772e4a4757e9f..5732fd59a227d0 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -153,12 +153,17 @@ static const struct crashtype *find_crashtype(const char *name) /* * This is forced noinline just so it distinctly shows up in the stackdump * which makes validation of expected lkdtm crashes easier. + * + * NOTE: having a valid return value helps prevent the compiler from doing + * tail call optimizations and taking this out of the stack trace. */ -static noinline void lkdtm_do_action(const struct crashtype *crashtype) +static noinline int lkdtm_do_action(const struct crashtype *crashtype) { if (WARN_ON(!crashtype || !crashtype->func)) - return; + return -EINVAL; crashtype->func(); + + return 0; } static int lkdtm_register_cpoint(struct crashpoint *crashpoint, @@ -167,10 +172,8 @@ static int lkdtm_register_cpoint(struct crashpoint *crashpoint, int ret; /* If this doesn't have a symbol, just call immediately. */ - if (!crashpoint->kprobe.symbol_name) { - lkdtm_do_action(crashtype); - return 0; - } + if (!crashpoint->kprobe.symbol_name) + return lkdtm_do_action(crashtype); if (lkdtm_kprobe != NULL) unregister_kprobe(lkdtm_kprobe); @@ -216,7 +219,7 @@ static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) spin_unlock_irqrestore(&crash_count_lock, flags); if (do_it) - lkdtm_do_action(lkdtm_crashtype); + return lkdtm_do_action(lkdtm_crashtype); return 0; } @@ -303,6 +306,7 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf, { const struct crashtype *crashtype; char *buf; + int err; if (count >= PAGE_SIZE) return -EINVAL; @@ -326,9 +330,11 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf, return -EINVAL; pr_info("Performing direct entry %s\n", crashtype->name); - lkdtm_do_action(crashtype); + err = lkdtm_do_action(crashtype); *off += count; + if (err) + return err; return count; } From edb6538da3df83806fffcfc1b873d0895c81b9e8 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 22 Jan 2024 16:49:35 -0800 Subject: [PATCH 147/707] lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid tail call optimization When testing with lkdtm_HUNG_TASK() and looking at the output, I expected to see lkdtm_HUNG_TASK() in the stack crawl but it wasn't there. Instead, the top function on at least some devices was schedule() due to tail call optimization. Let's do two things to help here: 1. We'll mark this as "__noreturn". On GCC at least this is documented to prevent tail call optimization. The docs [1] say "In order to preserve backtraces, GCC will never turn calls to noreturn functions into tail calls." 2. We'll add a BUG_ON(1) at the end which means that schedule() is no longer a tail call. Note that this is potentially important because if we _did_ end up returning from schedule() due to some weird issue then we'd potentially be violating the "noreturn" that we told the compiler about. BUG is the right thing to do here. [1] https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20240122164935.2.I26e8f68c312824fcc80c19d4e91de2d2bef958f0@changeid Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index b080eb2335eba8..d1222d3eda2f19 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -294,10 +294,11 @@ static void lkdtm_SPINLOCKUP(void) __release(&lock_me_up); } -static void lkdtm_HUNG_TASK(void) +static void __noreturn lkdtm_HUNG_TASK(void) { set_current_state(TASK_UNINTERRUPTIBLE); schedule(); + BUG_ON(1); } static volatile unsigned int huge = INT_MAX - 2; From ace4b31b297dfd7b8c969ff5046c8128c3e025be Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 18 Jan 2024 16:19:13 +0530 Subject: [PATCH 148/707] cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h Move the declaration of functions defined in the OPP core to pm_opp.h. These were added to cpufreq.h as it was the only user of the APIs, but that was a mistake perhaps. Fix it. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 20 -------------------- include/linux/pm_opp.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3ddc6..8ff3e79727d80c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -694,26 +694,6 @@ struct cpufreq_frequency_table { * order */ }; -#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -#else -static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ - return -EINVAL; -} - -static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ -} -#endif - /* * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table * @pos: the cpufreq_frequency_table * to use as a loop cursor. diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 76dcb7f37bcdff..f1ac8bde09cb56 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -16,6 +16,7 @@ #include struct clk; +struct cpufreq_frequency_table; struct regulator; struct dev_pm_opp; struct device; @@ -444,6 +445,21 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #endif /* CONFIG_PM_OPP */ +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +#else +static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ +} +#endif + + #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); From 52501486483e1646852f78f3f5af89ab573d2caf Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 15 Jan 2024 23:27:00 +0800 Subject: [PATCH 149/707] eventfd: move 'eventfd-count' printing out of spinlock When printing eventfd->count, interrupts will be disabled and a spinlock will be obtained, competing with eventfd_write(). By moving the "eventfd-count" print out of the spinlock and merging multiple seq_printf() into one, it could improve a bit, just like timerfd_show(). Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_B0B3D2BD9861FD009E03AB18A81783322709@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 0252b71099fbca..fc4d8109076392 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif From 5b3d743da951aeaedda401304d88b7b6ec1969d7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 23 Jan 2024 09:34:50 +0100 Subject: [PATCH 150/707] dt-bindings: sram: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. Signed-off-by: Krzysztof Kozlowski Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240123083450.20996-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jernej Skrabec --- .../bindings/sram/allwinner,sun4i-a10-system-control.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml index a1c96985951ff2..cf07b8f787a6ed 100644 --- a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml +++ b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml @@ -56,7 +56,7 @@ properties: ranges: true patternProperties: - "^sram@[a-z0-9]+": + "^sram@[a-f0-9]+": $ref: /schemas/sram/sram.yaml# unevaluatedProperties: false From 26ca757780d1ba9a982f8b79aef8e4cf5d171182 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Jan 2024 21:18:57 -0800 Subject: [PATCH 151/707] clk: sunxi: usb: fix kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the function description comment to immediately above the function implementation, the add function parameter descriptions to prevent kernel-doc warnings: clk-usb.c:80: warning: expecting prototype for sunxi_usb_clk_setup(). Prototype was for SUNXI_USB_MAX_SIZE() instead clk-usb.c:91: warning: Function parameter or struct member 'node' not described in 'sunxi_usb_clk_setup' clk-usb.c:91: warning: Function parameter or struct member 'data' not described in 'sunxi_usb_clk_setup' clk-usb.c:91: warning: Function parameter or struct member 'lock' not described in 'sunxi_usb_clk_setup' Signed-off-by: Randy Dunlap Cc: Emilio López Cc: Michael Turquette Cc: Stephen Boyd Cc: Cc: Chen-Yu Tsai Cc: Jernej Skrabec Cc: Samuel Holland Cc: Cc: Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240121051858.17647-1-rdunlap@infradead.org Signed-off-by: Jernej Skrabec --- drivers/clk/sunxi/clk-usb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/clk/sunxi/clk-usb.c b/drivers/clk/sunxi/clk-usb.c index 5460218f3467ab..3c53f65002a285 100644 --- a/drivers/clk/sunxi/clk-usb.c +++ b/drivers/clk/sunxi/clk-usb.c @@ -73,9 +73,6 @@ static const struct reset_control_ops sunxi_usb_reset_ops = { .deassert = sunxi_usb_reset_deassert, }; -/** - * sunxi_usb_clk_setup() - Setup function for usb gate clocks - */ #define SUNXI_USB_MAX_SIZE 32 @@ -85,6 +82,12 @@ struct usb_clk_data { bool reset_needs_clk; }; +/** + * sunxi_usb_clk_setup() - Setup function for usb gate clocks + * @node: &struct device_node for the clock + * @data: &struct usb_clk_data for the clock + * @lock: spinlock for the clock + */ static void __init sunxi_usb_clk_setup(struct device_node *node, const struct usb_clk_data *data, spinlock_t *lock) From 64b4ea871b86e0177c3938360ff83d528ac10018 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 10 Jan 2024 14:28:40 -0600 Subject: [PATCH 152/707] dt-bindings: iio: adc: Add binding for AD7380 ADCs This adds a binding specification for the Analog Devices Inc. AD7380 family of ADCs. Signed-off-by: David Lechner Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240110-ad7380-mainline-v4-1-93a1d96b50fa@baylibre.com Signed-off-by: Jonathan Cameron --- .../bindings/iio/adc/adi,ad7380.yaml | 86 +++++++++++++++++++ MAINTAINERS | 9 ++ 2 files changed, 95 insertions(+) create mode 100644 Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml new file mode 100644 index 00000000000000..5a70d1ee768b5a --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/adc/adi,ad7380.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices Simultaneous Sampling Analog to Digital Converters + +maintainers: + - Michael Hennerich + - Nuno Sá + +description: | + * https://www.analog.com/en/products/ad7380.html + * https://www.analog.com/en/products/ad7381.html + * https://www.analog.com/en/products/ad7383.html + * https://www.analog.com/en/products/ad7384.html + +$ref: /schemas/spi/spi-peripheral-props.yaml# + +properties: + compatible: + enum: + - adi,ad7380 + - adi,ad7381 + - adi,ad7383 + - adi,ad7384 + + reg: + maxItems: 1 + + spi-max-frequency: + maximum: 80000000 + spi-cpol: true + spi-cpha: true + + vcc-supply: + description: A 3V to 3.6V supply that powers the chip. + + vlogic-supply: + description: + A 1.65V to 3.6V supply for the logic pins. + + refio-supply: + description: + A 2.5V to 3.3V supply for the external reference voltage. When omitted, + the internal 2.5V reference is used. + + interrupts: + description: + When the device is using 1-wire mode, this property is used to optionally + specify the ALERT interrupt. + maxItems: 1 + +required: + - compatible + - reg + - vcc-supply + - vlogic-supply + +unevaluatedProperties: false + +examples: + - | + #include + + spi { + #address-cells = <1>; + #size-cells = <0>; + + adc@0 { + compatible = "adi,ad7380"; + reg = <0>; + + spi-cpol; + spi-cpha; + spi-max-frequency = <80000000>; + + interrupts = <27 IRQ_TYPE_EDGE_FALLING>; + interrupt-parent = <&gpio0>; + + vcc-supply = <&supply_3_3V>; + vlogic-supply = <&supply_3_3V>; + refio-supply = <&supply_2_5V>; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index dcf99f9f5b8402..da83fdae811b79 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -427,6 +427,15 @@ W: http://wiki.analog.com/AD7142 W: https://ez.analog.com/linux-software-drivers F: drivers/input/misc/ad714x.c +AD738X ADC DRIVER (AD7380/1/2/4) +M: Michael Hennerich +M: Nuno Sá +R: David Lechner +S: Supported +W: https://wiki.analog.com/resources/tools-software/linux-drivers/iio-adc/ad738x +W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml + AD7877 TOUCHSCREEN DRIVER M: Michael Hennerich S: Supported From 6f34271e6342c50974fb4676b05d648d4560f2e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Jan 2024 08:04:42 -0700 Subject: [PATCH 153/707] block/mq-deadline: pass in queue directly to dd_insert_request() The hardware queue isn't relevant, deadline only operates on the queue itself. Pass in the queue directly rather than the hardware queue, as that more clearly explains what is being operated on. Reviewed-by: Bart Van Assche Tested-by: Oleksandr Natalenko Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/mq-deadline.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f958e79277b8bc..9b7563e9d638ed 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -792,10 +792,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, /* * add rq to rbtree and fifo */ -static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, +static void dd_insert_request(struct request_queue *q, struct request *rq, blk_insert_t flags, struct list_head *free) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); @@ -875,7 +874,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, flags, &free); + dd_insert_request(q, rq, flags, &free); } spin_unlock(&dd->lock); From b0f6732db3959da8595586b4145cc35af3c5cb9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Jan 2024 10:46:52 -0700 Subject: [PATCH 154/707] block/mq-deadline: serialize request dispatching If we're entering request dispatch but someone else is already dispatching, then just skip this dispatch. We know IO is inflight and this will trigger another dispatch event for any completion. This will potentially cause slightly lower queue depth for contended cases, but those are slowed down anyway and this should not cause an issue. By itself, this patch doesn't help a whole lot, as the dispatch lock contention reduction is just eating up by the same dd->lock now seeing increased insertion contention. But it's required work to be able to reduce the lock contention in general. Reviewed-by: Bart Van Assche Tested-by: Oleksandr Natalenko Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/mq-deadline.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 9b7563e9d638ed..79bc3b6784b373 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -79,10 +79,20 @@ struct dd_per_prio { struct io_stats_per_prio stats; }; +enum { + DD_DISPATCHING = 0, +}; + struct deadline_data { /* * run time data */ + struct { + spinlock_t lock; + spinlock_t zone_lock; + } ____cacheline_aligned_in_smp; + + unsigned long run_state; struct dd_per_prio per_prio[DD_PRIO_COUNT]; @@ -100,9 +110,6 @@ struct deadline_data { int front_merges; u32 async_depth; int prio_aging_expire; - - spinlock_t lock; - spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ @@ -600,6 +607,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) struct request *rq; enum dd_prio prio; + /* + * If someone else is already dispatching, skip this one. This will + * defer the next dispatch event to when something completes, and could + * potentially lower the queue depth for contended cases. + * + * See the logic in blk_mq_do_dispatch_sched(), which loops and + * retries if nothing is dispatched. + */ + if (test_bit(DD_DISPATCHING, &dd->run_state) || + test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) + return NULL; + spin_lock(&dd->lock); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) @@ -616,6 +635,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) } unlock: + clear_bit_unlock(DD_DISPATCHING, &dd->run_state); spin_unlock(&dd->lock); return rq; @@ -706,6 +726,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; + spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -722,8 +745,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; - spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); From 8f764b91fdf2965956b036be6f9e79f77bd6c903 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Jan 2024 08:16:36 -0700 Subject: [PATCH 155/707] block/mq-deadline: skip expensive merge lookups if contended We do several stages of merging in the block layer - the most likely one to work is also the cheap one, merging direct in the per-task plug when IO is submitted. Getting merges outside of that is a lot less likely, but IO schedulers may still maintain internal data structures to facilitate merge lookups outside of the plug. Make mq-deadline skip expensive merge lookups if the queue lock is already contended. The likelihood of getting a merge here is not very high, hence it should not be a problem skipping the attempt in the also unlikely event that the queue is already contended. Perf diff shows the difference between a random read/write workload with 4 threads doing IO, with expensive merges turned on and off: 25.00% +61.94% [kernel.kallsyms] [k] queued_spin_lock_slowpath where we almost quadruple the lock contention by attempting these expensive merges. Tested-by: Oleksandr Natalenko Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/mq-deadline.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 79bc3b6784b373..740b94f36cacf7 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -800,7 +800,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, struct request *free = NULL; bool ret; - spin_lock(&dd->lock); + /* + * bio merging is called for every bio queued, and it's very easy + * to run into contention because of that. If we fail getting + * the dd lock, just skip this merge attempt. For related IO, the + * plug will be the successful merging point. If we get here, we + * already failed doing the obvious merge. Chances of actually + * getting a merge off this path is a lot slimmer, so skipping an + * occassional lookup that will most likely not succeed anyway should + * not be a problem. + */ + if (!spin_trylock(&dd->lock)) + return false; + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); From 574e7779cf583171acb5bf6365047bb0941b387c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 18 Jan 2024 10:53:34 -0700 Subject: [PATCH 156/707] block/mq-deadline: use separate insertion lists Reduce lock contention on dd->lock by calling dd_insert_request() from inside the dispatch callback instead of from the insert callback. This patch is inspired by a patch from Jens. With the previous dispatch and merge optimization, this drastically reduces contention for a sample cases of 32 threads doing IO to devices. The test case looks as follows: fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \ --ioengine=io_uring --norandommap --runtime=60 --rw=randread \ --thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \ --iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \ --name=scaletest --filename=/dev/$DEV Before: Device IOPS sys contention diff ==================================================== null_blk 879K 89% 93.6% nvme0n1 901K 86% 94.5% and after this and the previous two patches: Device IOPS sys contention diff ==================================================== null_blk 2867K 11.1% ~6.0% +226% nvme0n1 3162K 9.9% ~5.0% +250% which basically eliminates all of the lock contention, it's down to more normal levels. The throughput increases show that nicely, with more than a 300% improvement for both cases. Signed-off-by: Bart Van Assche Tested-by: Oleksandr Natalenko Reviewed-by: Johannes Thumshirn [axboe: expand commit message with more details and perf results] Signed-off-by: Jens Axboe --- block/mq-deadline.c | 66 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 740b94f36cacf7..1b0de4fc39582b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -89,11 +89,15 @@ struct deadline_data { */ struct { spinlock_t lock; + spinlock_t insert_lock; spinlock_t zone_lock; } ____cacheline_aligned_in_smp; unsigned long run_state; + struct list_head at_head; + struct list_head at_tail; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -120,6 +124,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; +static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free); + static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { @@ -592,6 +599,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, return NULL; } +static void __dd_do_insert(struct request_queue *q, blk_insert_t flags, + struct list_head *list, struct list_head *free) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(q, rq, flags, free); + } +} + +static void dd_do_insert(struct request_queue *q, struct list_head *free) +{ + struct deadline_data *dd = q->elevator->elevator_data; + LIST_HEAD(at_head); + LIST_HEAD(at_tail); + + spin_lock(&dd->insert_lock); + list_splice_init(&dd->at_head, &at_head); + list_splice_init(&dd->at_tail, &at_tail); + spin_unlock(&dd->insert_lock); + + __dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); + __dd_do_insert(q, 0, &at_tail, free); +} + /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * @@ -602,10 +636,12 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { - struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; + LIST_HEAD(free); /* * If someone else is already dispatching, skip this one. This will @@ -620,6 +656,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return NULL; spin_lock(&dd->lock); + dd_do_insert(q, &free); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; @@ -638,6 +675,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) clear_bit_unlock(DD_DISPATCHING, &dd->run_state); spin_unlock(&dd->lock); + blk_mq_free_requests(&free); return rq; } @@ -727,8 +765,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; spin_lock_init(&dd->lock); + spin_lock_init(&dd->insert_lock); spin_lock_init(&dd->zone_lock); + INIT_LIST_HEAD(&dd->at_head); + INIT_LIST_HEAD(&dd->at_tail); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -899,19 +941,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - LIST_HEAD(free); - - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(q, rq, flags, &free); - } - spin_unlock(&dd->lock); - - blk_mq_free_requests(&free); + spin_lock(&dd->insert_lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_splice_init(list, &dd->at_head); + else + list_splice_init(list, &dd->at_tail); + spin_unlock(&dd->insert_lock); } /* Callback from inside blk_mq_rq_ctx_init(). */ @@ -990,6 +1026,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; + if (!list_empty_careful(&dd->at_head) || + !list_empty_careful(&dd->at_tail)) + return true; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; From 88b61090083299c3c88162c357cabd30c1c61698 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Jan 2024 14:50:58 -0700 Subject: [PATCH 157/707] block/bfq: pass in queue directly to bfq_insert_request() The hardware queue isn't relevant, bfq only operates on the queue itself. Pass in the queue directly rather than the hardware queue, as that more clearly explains what is being operated on. Reviewed-by: Bart Van Assche Tested-by: Oleksandr Natalenko Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3cce6de464a7b7..7d08442474ec31 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6236,10 +6236,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, static struct bfq_queue *bfq_init_rq(struct request *rq); -static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, +static void bfq_insert_request(struct request_queue *q, struct request *rq, blk_insert_t flags) { - struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; bool idle_timer_disabled = false; @@ -6301,7 +6300,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - bfq_insert_request(hctx, rq, flags); + bfq_insert_request(hctx->queue, rq, flags); } } From b1daebfc62466586bd256c98def34a21461ed128 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Jan 2024 14:54:37 -0700 Subject: [PATCH 158/707] block/bfq: serialize request dispatching If we're entering request dispatch but someone else is already dispatching, then just skip this dispatch. We know IO is inflight and this will trigger another dispatch event for any completion. This will potentially cause slightly lower queue depth for contended cases, but those are slowed down anyway and this should not cause an issue. By itself, this patch doesn't help a whole lot, as the dispatch lock contention reduction is just eaten up by the same bfqd->lock now seeing increased insertion contention. But it's required work to be able to reduce the lock contention in general. Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 17 +++++++++++++++-- block/bfq-iosched.h | 12 ++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7d08442474ec31..5ef4a4eba572c8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5304,6 +5304,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_queue *in_serv_queue; bool waiting_rq, idle_timer_disabled = false; + /* + * If someone else is already dispatching, skip this one. This will + * defer the next dispatch event to when something completes, and could + * potentially lower the queue depth for contended cases. + * + * See the logic in blk_mq_do_dispatch_sched(), which loops and + * retries if nothing is dispatched. + */ + if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || + test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) + return NULL; + spin_lock_irq(&bfqd->lock); in_serv_queue = bfqd->in_service_queue; @@ -5315,6 +5327,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); } + clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); spin_unlock_irq(&bfqd->lock); bfq_update_dispatch_stats(hctx->queue, rq, idle_timer_disabled ? in_serv_queue : NULL, @@ -7210,6 +7223,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) q->elevator = eq; spin_unlock_irq(&q->queue_lock); + spin_lock_init(&bfqd->lock); + /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow @@ -7328,8 +7343,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* see comments on the definition of next field inside bfq_data */ bfqd->actuator_load_threshold = 4; - spin_lock_init(&bfqd->lock); - /* * The invocation of the next bfq_create_group_hierarchy * function is the head of a chain of function calls diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 467e8cfc41a249..56ff69f2216327 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -504,12 +504,22 @@ struct bfq_io_cq { unsigned int requests; /* Number of requests this process has in flight */ }; +enum { + BFQ_DISPATCHING = 0, +}; + /** * struct bfq_data - per-device data structure. * * All the fields are protected by @lock. */ struct bfq_data { + struct { + spinlock_t lock; + } ____cacheline_aligned_in_smp; + + unsigned long run_state; + /* device request queue */ struct request_queue *queue; /* dispatch queue */ @@ -795,8 +805,6 @@ struct bfq_data { /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; - spinlock_t lock; - /* * bic associated with the task issuing current bio for * merging. This and the next field are used as a support to From 58ea8280b7a56f541ceba62efef77babaf45a680 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Jan 2024 14:58:37 -0700 Subject: [PATCH 159/707] block/bfq: skip expensive merge lookups if contended We do several stages of merging in the block layer - the most likely one to work is also the cheap one, merging direct in the per-task plug when IO is submitted. Getting merges outside of that is a lot less likely, but IO schedulers may still maintain internal data structures to facilitate merge lookups outside of the plug. Make BFQ skip expensive merge lookups if the queue lock or bfqd lock is already contended. The likelihood of getting a merge here is not very high, hence it should not be a problem skipping the attempt in the also unlikely event that either the queue or bfqd are already contended. Perf diff shows the difference between a random read/write workload with 4 threads doing IO, with expensive merges turned on and off: 31.70% +54.80% [kernel.kallsyms] [k] queued_spin_lock_slowpath where we almost triple the lock contention (~32% -> ~87%) by attempting these expensive merges, and performance drops from 1630K to 1050K IOPS. At the same time, sys time drops from 37% to 14%. Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5ef4a4eba572c8..ea16a0c5308263 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) return icq; } +static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) +{ + if (!current->io_context) + return NULL; + if (spin_trylock_irq(&q->queue_lock)) { + struct bfq_io_cq *icq; + + icq = icq_to_bic(ioc_lookup_icq(q)); + spin_unlock_irq(&q->queue_lock); + return icq; + } + + return NULL; +} + /* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. @@ -2454,10 +2469,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. */ - struct bfq_io_cq *bic = bfq_bic_lookup(q); + struct bfq_io_cq *bic = bfq_bic_try_lookup(q); bool ret; - spin_lock_irq(&bfqd->lock); + /* + * bio merging is called for every bio queued, and it's very easy + * to run into contention because of that. If we fail getting + * the dd lock, just skip this merge attempt. For related IO, the + * plug will be the successful merging point. If we get here, we + * already failed doing the obvious merge. Chances of actually + * getting a merge off this path is a lot slimmer, so skipping an + * occassional lookup that will most likely not succeed anyway should + * not be a problem. + */ + if (!spin_trylock_irq(&bfqd->lock)) + return false; if (bic) { /* From 1ee906ccc23b027b28505b84a37eb5c9e75db9ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Jan 2024 15:04:51 -0700 Subject: [PATCH 160/707] block/bfq: use separate insertion lists Based on the similar patch for mq-deadline, this uses separate insertion lists so we can defer touching dd->lock until dispatch time. This improves the following fio workload: fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \ --ioengine=io_uring --norandommap --runtime=60 --rw=randread \ --thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \ --iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \ --name=/dev/nvme0n1 --filename=/dev/nvme0n1 from: /dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=1113: Fri Jan 19 20:59:26 2024 read: IOPS=567k, BW=277MiB/s (290MB/s)(1820MiB/6575msec) bw ( KiB/s): min=274824, max=291156, per=100.00%, avg=283930.08, stdev=143.01, samples=416 iops : min=549648, max=582312, avg=567860.31, stdev=286.01, samples=416 cpu : usr=0.18%, sys=86.04%, ctx=866079, majf=0, minf=0 IO depths : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=3728344,0,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=4 with 96% lock contention and 86% sys time, to: /dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=8922: Sat Jan 20 11:16:20 2024 read: IOPS=1550k, BW=757MiB/s (794MB/s)(19.6GiB/26471msec) bw ( KiB/s): min=754668, max=848896, per=100.00%, avg=775459.33, stdev=624.43, samples=1664 iops : min=1509336, max=1697793, avg=1550918.83, stdev=1248.87, samples=1664 cpu : usr=1.34%, sys=14.49%, ctx=9950560, majf=0, minf=0 IO depths : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=41042924,0,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=4 with ~30% lock contention and 14.5% sys time, by applying the lessons learnt with scaling mq-deadline. Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 70 ++++++++++++++++++++++++++++++++++----------- block/bfq-iosched.h | 4 +++ 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ea16a0c5308263..9bd57baa4b0b93 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5174,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + if (!list_empty_careful(&bfqd->at_head) || + !list_empty_careful(&bfqd->at_tail)) + return true; + /* * Avoiding lock: a race on bfqd->queued should cause at * most a call to dispatch for nothing @@ -5323,12 +5327,44 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q, bool idle_timer_disabled) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ +static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free); + +static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags, + struct list_head *list, struct list_head *free) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + bfq_insert_request(q, rq, flags, free); + } +} + +static void bfq_do_insert(struct request_queue *q, struct list_head *free) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + LIST_HEAD(at_head); + LIST_HEAD(at_tail); + + spin_lock(&bfqd->insert_lock); + list_splice_init(&bfqd->at_head, &at_head); + list_splice_init(&bfqd->at_tail, &at_tail); + spin_unlock(&bfqd->insert_lock); + + __bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); + __bfq_do_insert(q, 0, &at_tail, free); +} + static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; bool waiting_rq, idle_timer_disabled = false; + LIST_HEAD(free); /* * If someone else is already dispatching, skip this one. This will @@ -5344,6 +5380,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock_irq(&bfqd->lock); + bfq_do_insert(hctx->queue, &free); + in_serv_queue = bfqd->in_service_queue; waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); @@ -5355,6 +5393,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); bfq_update_dispatch_stats(hctx->queue, rq, idle_timer_disabled ? in_serv_queue : NULL, idle_timer_disabled); @@ -6276,25 +6315,20 @@ static inline void bfq_update_insert_stats(struct request_queue *q, static struct bfq_queue *bfq_init_rq(struct request *rq); static void bfq_insert_request(struct request_queue *q, struct request *rq, - blk_insert_t flags) + blk_insert_t flags, struct list_head *free) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; bool idle_timer_disabled = false; blk_opf_t cmd_flags; - LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif - spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - if (blk_mq_sched_try_insert_merge(q, rq, &free)) { - spin_unlock_irq(&bfqd->lock); - blk_mq_free_requests(&free); + if (blk_mq_sched_try_insert_merge(q, rq, free)) return; - } trace_block_rq_insert(rq); @@ -6324,8 +6358,6 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq, * merge). */ cmd_flags = rq->cmd_flags; - spin_unlock_irq(&bfqd->lock); - bfq_update_insert_stats(q, bfqq, idle_timer_disabled, cmd_flags); } @@ -6334,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { - while (!list_empty(list)) { - struct request *rq; + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - bfq_insert_request(hctx->queue, rq, flags); - } + spin_lock_irq(&bfqd->insert_lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_splice_init(list, &bfqd->at_head); + else + list_splice_init(list, &bfqd->at_tail); + spin_unlock_irq(&bfqd->insert_lock); } static void bfq_update_hw_tag(struct bfq_data *bfqd) @@ -7250,6 +7284,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) spin_unlock_irq(&q->queue_lock); spin_lock_init(&bfqd->lock); + spin_lock_init(&bfqd->insert_lock); + + INIT_LIST_HEAD(&bfqd->at_head); + INIT_LIST_HEAD(&bfqd->at_tail); /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 56ff69f2216327..f44f5d4ec2f4a2 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -516,10 +516,14 @@ enum { struct bfq_data { struct { spinlock_t lock; + spinlock_t insert_lock; } ____cacheline_aligned_in_smp; unsigned long run_state; + struct list_head at_head; + struct list_head at_tail; + /* device request queue */ struct request_queue *queue; /* dispatch queue */ From 18975ce057997fdee8b3d7df2b36f3144856fe24 Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Wed, 24 Jan 2024 14:13:55 +0000 Subject: [PATCH 161/707] selftests/seccomp: Handle EINVAL on unshare(CLONE_NEWPID) unshare(CLONE_NEWPID) can return EINVAL if the kernel does not have the CONFIG_PID_NS option enabled. Add a check on these calls to skip the test if we receive EINVAL. Signed-off-by: Terry Tritton Link: https://lore.kernel.org/r/20240124141357.1243457-2-terry.tritton@linaro.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 38f6514699682b..5e705674b70670 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3709,7 +3709,12 @@ TEST(user_notification_sibling_pid_ns) ASSERT_GE(pid, 0); if (pid == 0) { - ASSERT_EQ(unshare(CLONE_NEWPID), 0); + ASSERT_EQ(unshare(CLONE_NEWPID), 0) { + if (errno == EPERM) + SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN"); + else if (errno == EINVAL) + SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)"); + } pid2 = fork(); ASSERT_GE(pid2, 0); @@ -3727,6 +3732,8 @@ TEST(user_notification_sibling_pid_ns) ASSERT_EQ(unshare(CLONE_NEWPID), 0) { if (errno == EPERM) SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN"); + else if (errno == EINVAL) + SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)"); } ASSERT_EQ(errno, 0); From fbcdf41167fea3d7d17a9ae6ffa79e2afb41881a Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Wed, 24 Jan 2024 14:13:56 +0000 Subject: [PATCH 162/707] selftests/seccomp: Change the syscall used in KILL_THREAD test The Bionic version of pthread_create used on Android calls the prctl function to give the stack and thread local storage a useful name. This will cause the KILL_THREAD test to fail as it will kill the thread as soon as it is created. change the test to use getpid instead of prctl. Signed-off-by: Terry Tritton Link: https://lore.kernel.org/r/20240124141357.1243457-3-terry.tritton@linaro.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 5e705674b70670..da11b95b88721b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -784,7 +784,7 @@ void *kill_thread(void *data) bool die = (bool)data; if (die) { - prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + syscall(__NR_getpid); return (void *)SIBLING_EXIT_FAILURE; } @@ -803,11 +803,11 @@ void kill_thread_or_group(struct __test_metadata *_metadata, { pthread_t thread; void *status; - /* Kill only when calling __NR_prctl. */ + /* Kill only when calling __NR_getpid. */ struct sock_filter filter_thread[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; @@ -819,7 +819,7 @@ void kill_thread_or_group(struct __test_metadata *_metadata, struct sock_filter filter_process[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), BPF_STMT(BPF_RET|BPF_K, kill), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; From 0c6f28a844311b8f81b4b117f586189c704d3f33 Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Wed, 24 Jan 2024 14:13:57 +0000 Subject: [PATCH 163/707] selftests/seccomp: user_notification_addfd check nextfd is available Currently the user_notification_addfd test checks what the next expected file descriptor will be by incrementing a variable nextfd. This does not account for file descriptors that may already be open before the test is started and will cause the test to fail if any exist. Replace nextfd++ with a function get_next_fd which will check and return the next available file descriptor. Signed-off-by: Terry Tritton Link: https://lore.kernel.org/r/20240124141357.1243457-4-terry.tritton@linaro.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index da11b95b88721b..cacf6507f69055 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -4044,6 +4044,16 @@ TEST(user_notification_filter_empty_threaded) EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); } + +int get_next_fd(int prev_fd) +{ + for (int i = prev_fd + 1; i < FD_SETSIZE; ++i) { + if (fcntl(i, F_GETFD) == -1) + return i; + } + _exit(EXIT_FAILURE); +} + TEST(user_notification_addfd) { pid_t pid; @@ -4060,7 +4070,7 @@ TEST(user_notification_addfd) /* There may be arbitrary already-open fds at test start. */ memfd = memfd_create("test", 0); ASSERT_GE(memfd, 0); - nextfd = memfd + 1; + nextfd = get_next_fd(memfd); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret) { @@ -4071,7 +4081,8 @@ TEST(user_notification_addfd) /* Check that the basic notification machinery works */ listener = user_notif_syscall(__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER); - ASSERT_EQ(listener, nextfd++); + ASSERT_EQ(listener, nextfd); + nextfd = get_next_fd(nextfd); pid = fork(); ASSERT_GE(pid, 0); @@ -4126,14 +4137,16 @@ TEST(user_notification_addfd) /* Verify we can set an arbitrary remote fd */ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); - EXPECT_EQ(fd, nextfd++); + EXPECT_EQ(fd, nextfd); + nextfd = get_next_fd(nextfd); EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* Verify we can set an arbitrary remote fd with large size */ memset(&big, 0x0, sizeof(big)); big.addfd = addfd; fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); - EXPECT_EQ(fd, nextfd++); + EXPECT_EQ(fd, nextfd); + nextfd = get_next_fd(nextfd); /* Verify we can set a specific remote fd */ addfd.newfd = 42; @@ -4171,7 +4184,8 @@ TEST(user_notification_addfd) * Child has earlier "low" fds and now 42, so we expect the next * lowest available fd to be assigned here. */ - EXPECT_EQ(fd, nextfd++); + EXPECT_EQ(fd, nextfd); + nextfd = get_next_fd(nextfd); ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* From 367122c529f35b4655acbe33c0cc4d6d3b32ba71 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 24 Jan 2024 15:13:40 -0300 Subject: [PATCH 164/707] libfs: Attempt exact-match comparison first during casefolded lookup Casefolded comparisons are (obviously) way more costly than a simple memcmp. Try the case-sensitive comparison first, falling-back to the case-insensitive lookup only when needed. This allows any exact-match lookup to complete without having to walk the utf8 trie. Note that, for strict mode, generic_ci_d_compare used to reject an invalid UTF-8 string, which would now be considered valid if it exact-matches the disk-name. But, if that is the case, the filesystem is corrupt. More than that, it really doesn't matter in practice, because the name-under-lookup will have already been rejected by generic_ci_d_hash and we won't even get here. The memcmp is safe under RCU because we are operating on str/len instead of dentry->d_name directly, and the caller guarantees their consistency between each other in __d_lookup_rcu_op_compare. Link: https://lore.kernel.org/r/87ttn2sip7.fsf_-_@mailhost.krisman.be Suggested-by: Linus Torvalds Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b015544..306a0510b7dc25 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1704,16 +1704,28 @@ bool is_empty_dir_inode(struct inode *inode) static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct super_block *sb = dentry->d_sb; - const struct unicode_map *um = sb->s_encoding; - struct qstr qstr = QSTR_INIT(str, len); + const struct dentry *parent; + const struct inode *dir; char strbuf[DNAME_INLINE_LEN]; - int ret; + struct qstr qstr; + + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + * + * This comparison is safe under RCU because the caller + * guarantees the consistency between str and len. See + * __d_lookup_rcu_op_compare() for details. + */ + if (len == name->len && !memcmp(str, name->name, len)) + return 0; + parent = READ_ONCE(dentry->d_parent); + dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; + return 1; + /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry @@ -1724,20 +1736,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; - qstr.name = strbuf; + str = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - ret = utf8_strncasecmp(um, name, &qstr); - if (ret >= 0) - return ret; + qstr.len = len; + qstr.name = str; - if (sb_has_strict_encoding(sb)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); + return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } /** From 0b3bbd8f9baf245ec77d86f6f5bc902105b4bfa9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:11 -0800 Subject: [PATCH 165/707] counter: linux/counter.h: fix Excess kernel-doc description warning Remove the @priv: line to prevent the kernel-doc warning: include/linux/counter.h:400: warning: Excess struct member 'priv' description in 'counter_device' Signed-off-by: Randy Dunlap Fixes: f2ee4759fb70 ("counter: remove old and now unused registration API") Link: https://lore.kernel.org/r/20231223050511.13849-1-rdunlap@infradead.org Signed-off-by: William Breathitt Gray --- include/linux/counter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/counter.h b/include/linux/counter.h index 702e9108bbb44e..b767b5c821f58e 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -359,7 +359,6 @@ struct counter_ops { * @num_counts: number of Counts specified in @counts * @ext: optional array of Counter device extensions * @num_ext: number of Counter device extensions specified in @ext - * @priv: optional private data supplied by driver * @dev: internal device structure * @chrdev: internal character device structure * @events_list: list of current watching Counter events From 0410516aaef4bf08030845547fb94f4ff989fac0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jan 2024 16:02:16 +0200 Subject: [PATCH 166/707] x86/mm: Fix memory encryption features advertisement When memory encryption is enabled, the kernel prints the encryption flavor that the system supports. The check assumes that everything is AMD SME/SEV if it doesn't have the TDX CPU feature set. Hyper-V vTOM sets cc_vendor to CC_VENDOR_INTEL when it runs as L2 guest on top of TDX, but not X86_FEATURE_TDX_GUEST. Hyper-V only needs memory encryption enabled for I/O without the rest of CoCo enabling. To avoid confusion, check the cc_vendor directly. [ bp: Massage commit message. ] Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Jeremi Piotrowski Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Tom Lendacky Acked-by: Kai Huang Link: https://lore.kernel.org/r/20240124140217.533748-1-kirill.shutemov@linux.intel.com --- arch/x86/mm/mem_encrypt.c | 56 +++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index c290c55b632bd7..d035bce3a2b020 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,38 +42,42 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("Memory Encryption Features active:"); + pr_info("Memory Encryption Features active: "); - if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { - pr_cont(" Intel TDX\n"); - return; - } - - pr_cont(" AMD"); + switch (cc_vendor) { + case CC_VENDOR_INTEL: + pr_cont("Intel TDX\n"); + break; + case CC_VENDOR_AMD: + pr_cont("AMD"); - /* Secure Memory Encryption */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + /* Secure Memory Encryption */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { /* * SME is mutually exclusive with any of the SEV * features below. - */ - pr_cont(" SME\n"); - return; + */ + pr_cont(" SME\n"); + return; + } + + /* Secure Encrypted Virtualization */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + pr_cont(" SEV-ES"); + + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); + + pr_cont("\n"); + break; + default: + pr_cont("Unknown\n"); } - - /* Secure Encrypted Virtualization */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - pr_cont(" SEV"); - - /* Encrypted Register State */ - if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - pr_cont(" SEV-ES"); - - /* Secure Nested Paging */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - pr_cont(" SEV-SNP"); - - pr_cont("\n"); } /* Architecture __weak replacement functions */ From 653b325d96f0f71b5baad0f192b0bc7f08f6243d Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 21 Sep 2023 17:02:33 +0300 Subject: [PATCH 167/707] accel/habanalabs/gaudi2: add interrupt affinity for user interrupts User interrupts are MSIx interrupts coming from Gaudi2, that have specific range of IDs and are assigned to the sole use of the user process that opened the Gaudi2 device (reminder: there can be only a single user process running on Gaudi2 at any given time). The interrupts are allocated and managed by the driver and therefore, the user expects the driver to initialize them properly, which also includes setting the affinity to the related CPU cores of the device's NUMA node to get maximum performance. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 32 ++++++++++++++++++++ drivers/accel/habanalabs/common/habanalabs.h | 5 +++ drivers/accel/habanalabs/gaudi2/gaudi2.c | 5 +++ 3 files changed, 42 insertions(+) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index a73bd4be94b156..5eacbc73f1bb98 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2801,3 +2801,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) atomic_set(&captured_err_info->cs_timeout.write_enable, 1); captured_err_info->undef_opcode.write_enable = true; } + +void hl_init_cpu_for_irq(struct hl_device *hdev) +{ +#ifdef CONFIG_NUMA + struct cpumask *available_mask = &hdev->irq_affinity_mask; + int numa_node = hdev->pdev->dev.numa_node, i; + static struct cpumask cpu_mask; + + if (numa_node < 0) + return; + + if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) { + dev_err(hdev->dev, "No available affinities in current numa node\n"); + return; + } + + /* Remove HT siblings */ + for_each_cpu(i, &cpu_mask) + cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(i)), available_mask); +#endif +} + +void hl_set_irq_affinity(struct hl_device *hdev, int irq) +{ + if (cpumask_empty(&hdev->irq_affinity_mask)) { + dev_dbg(hdev->dev, "affinity mask is empty\n"); + return; + } + + if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask)) + dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 2a900c9941fee6..b1a7b229e16160 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3257,6 +3257,7 @@ struct hl_reset_info { * @clk_throttling: holds information about current/previous clock throttling events * @captured_err_info: holds information about errors. * @reset_info: holds current device reset information. + * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_inner_major_ver: the major of current loaded preboot inner version. * @fw_inner_minor_ver: the minor of current loaded preboot inner version. @@ -3446,6 +3447,8 @@ struct hl_device { struct hl_reset_info reset_info; + cpumask_t irq_affinity_mask; + u32 *stream_master_qid_arr; u32 fw_inner_major_ver; u32 fw_inner_minor_ver; @@ -4032,6 +4035,8 @@ void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count); void hl_enable_err_info_capture(struct hl_error_info *captured_err_info); +void hl_init_cpu_for_irq(struct hl_device *hdev); +void hl_set_irq_affinity(struct hl_device *hdev, int irq); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index e0e5615ef9b0f6..fd01525b1ea204 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -4254,6 +4254,8 @@ static int gaudi2_enable_msix(struct hl_device *hdev) if (gaudi2->hw_cap_initialized & HW_CAP_MSIX) return 0; + hl_init_cpu_for_irq(hdev); + rc = pci_alloc_irq_vectors(hdev->pdev, GAUDI2_MSIX_ENTRIES, GAUDI2_MSIX_ENTRIES, PCI_IRQ_MSIX); if (rc < 0) { @@ -4307,6 +4309,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev) i++, j++, user_irq_init_cnt++) { irq = pci_irq_vector(hdev->pdev, i); + hl_set_irq_affinity(hdev, irq); rc = request_irq(irq, hl_irq_user_interrupt_handler, 0, gaudi2_irq_name(i), &hdev->user_interrupt[j]); if (rc) { @@ -4333,6 +4336,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev) i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) { irq = pci_irq_vector(hdev->pdev, i); + irq_set_affinity_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR); @@ -4413,6 +4417,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev) k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) { irq = pci_irq_vector(hdev->pdev, i); + irq_set_affinity_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } From df3d17212018903324c4606743d74ff8b0ba3da3 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Mon, 11 Dec 2023 10:03:29 +0200 Subject: [PATCH 168/707] accel/habanalabs: increase HL_MAX_STR to 64 bytes to avoid warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a warning of a buffer overflow: ‘snprintf’ output between 38 and 47 bytes into a destination of size 32 Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index b1a7b229e16160..253873315888e1 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -2547,7 +2547,7 @@ struct hl_state_dump_specs { * DEVICES */ -#define HL_STR_MAX 32 +#define HL_STR_MAX 64 #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1) From 6101e4be8a6f4afa0d119ca9a0510032193661d5 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 14 Dec 2023 10:38:06 +0200 Subject: [PATCH 169/707] accel/habanalabs: fix DRAM BAR base address calculation When the DRAM region size in the BAR is not a power of 2, calculating the corresponding BAR base address should be done using the offset from the DRAM start address, and not using directly the DRAM address. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 5eacbc73f1bb98..5c46826e365929 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi if (is_power_of_2(prop->dram_pci_bar_size)) bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull); else - bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) * + bar_base_addr = region->region_base + + div64_u64((addr - region->region_base), prop->dram_pci_bar_size) * prop->dram_pci_bar_size; old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); From f1282caa974f6a7d6e0860b2d56104da63ae83bd Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 25 Dec 2023 00:28:36 +0200 Subject: [PATCH 170/707] accel/habanalabs: abort device reset for consecutive heartbeat failures The mechanism of aborting device reset for consecutive fatal errors is currently only for fatal errors that are reported by FW. A non-responsive FW and consecutive heartbeat failures is also considered fatal, so add them as well to this mechanism to avoid recurring device reset in such a case. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 5c46826e365929..cf004baf5e6213 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1769,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) hdev->device_cpu_disabled = false; hdev->reset_info.hard_reset_pending = false; + /* + * Put the device in an unusable state if there are 2 back to back resets due to + * fatal errors. + */ if (hdev->reset_info.reset_trigger_repeated && - (hdev->reset_info.prev_reset_trigger == - HL_DRV_RESET_FW_FATAL_ERR)) { - /* if there 2 back to back resets from FW, - * ensure driver puts the driver in a unusable state - */ + (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR || + hdev->reset_info.prev_reset_trigger == + HL_DRV_RESET_HEARTBEAT)) { dev_crit(hdev->dev, - "%s Consecutive FW fatal errors received, stopping hard reset\n", + "%s Consecutive fatal errors, stopping hard reset\n", dev_name(&(hdev)->pdev->dev)); rc = -EIO; goto out_err; From d545e7d8763e1f6cd75cc8de22cb414aca914c1c Mon Sep 17 00:00:00 2001 From: Farah Kassabri Date: Thu, 2 Nov 2023 11:53:29 +0200 Subject: [PATCH 171/707] accel/habanalabs/gaudi2: move HMMU page tables to device memory Currently the HMMU page tables reside in the host memory, which will cause host access from the device for every page walk. This can affect PCIe bandwidth in certain scenarios. To prevent that problem, HMMU page tables will be moved to the device memory so the miss transaction will read the hops from there instead of going to the host. Signed-off-by: Farah Kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 26 ++ drivers/accel/habanalabs/common/hw_queue.c | 17 + drivers/accel/habanalabs/common/mmu/Makefile | 2 +- drivers/accel/habanalabs/common/mmu/mmu.c | 223 ++++++++++- drivers/accel/habanalabs/common/mmu/mmu_v1.c | 352 +++--------------- drivers/accel/habanalabs/common/mmu/mmu_v2.c | 338 +++++++++++++++++ drivers/accel/habanalabs/gaudi/gaudi.c | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 245 ++++++++---- drivers/accel/habanalabs/gaudi2/gaudi2P.h | 12 +- .../include/hw_ip/mmu/mmu_general.h | 2 + 10 files changed, 836 insertions(+), 382 deletions(-) create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 253873315888e1..7397ce86b7f03a 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -443,18 +443,22 @@ enum hl_collective_mode { * a CB handle can be provided for jobs on this queue. * Otherwise, a CB address must be provided. * @collective_mode: collective mode of current queue + * @q_dram_bd_address: PQ dram address, used when PQ need to reside in DRAM. * @driver_only: true if only the driver is allowed to send a job to this queue, * false otherwise. * @binned: True if the queue is binned out and should not be used * @supports_sync_stream: True if queue supports sync stream + * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram */ struct hw_queue_properties { enum hl_queue_type type; enum queue_cb_alloc_flags cb_alloc_flags; enum hl_collective_mode collective_mode; + u64 q_dram_bd_address; u8 driver_only; u8 binned; u8 supports_sync_stream; + u8 dram_bd; }; /** @@ -1052,6 +1056,8 @@ struct hl_encaps_signals_mgr { * @collective_mode: collective mode of current queue * @kernel_address: holds the queue's kernel virtual address. * @bus_address: holds the queue's DMA address. + * @pq_dram_address: hold the dram address when the PQ is allocated, used when dram_bd is true in + * queue properites. * @pi: holds the queue's pi value. * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci). * @hw_queue_id: the id of the H/W queue. @@ -1061,6 +1067,7 @@ struct hl_encaps_signals_mgr { * @valid: is the queue valid (we have array of 32 queues, not all of them * exist). * @supports_sync_stream: True if queue supports sync stream + * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram */ struct hl_hw_queue { struct hl_cs_job **shadow_queue; @@ -1069,6 +1076,7 @@ struct hl_hw_queue { enum hl_collective_mode collective_mode; void *kernel_address; dma_addr_t bus_address; + u64 pq_dram_address; u32 pi; atomic_t ci; u32 hw_queue_id; @@ -1077,6 +1085,7 @@ struct hl_hw_queue { u16 int_queue_len; u8 valid; u8 supports_sync_stream; + u8 dram_bd; }; /** @@ -3889,6 +3898,7 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ struct hl_hr_mmu_funcs *hr_func); int hl_mmu_if_set_funcs(struct hl_device *hdev); void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, @@ -3896,6 +3906,22 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr); u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr); bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info); +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx); +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx); +void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr); +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr); +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr); +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop); +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx); +void hl_mmu_dr_flush(struct hl_ctx *ctx); +int hl_mmu_dr_init(struct hl_device *hdev); +void hl_mmu_dr_fini(struct hl_device *hdev); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, void __iomem *dst, u32 src_offset, u32 size); diff --git a/drivers/accel/habanalabs/common/hw_queue.c b/drivers/accel/habanalabs/common/hw_queue.c index d0087c0ec48c9f..3d04a7507cce3c 100644 --- a/drivers/accel/habanalabs/common/hw_queue.c +++ b/drivers/accel/habanalabs/common/hw_queue.c @@ -84,6 +84,8 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr) { struct hl_bd *bd; + u64 addr; + int i; bd = q->kernel_address; bd += hl_pi_2_offset(q->pi); @@ -91,7 +93,16 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, bd->len = cpu_to_le32(len); bd->ptr = cpu_to_le64(ptr); + if (q->dram_bd) + for (i = 0 ; i < 2 ; i++) { + addr = q->pq_dram_address + + ((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64))); + hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr, + (u64 *)(bd) + i, DEBUGFS_WRITE64); + } + q->pi = hl_queue_inc_ptr(q->pi); + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); } @@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev) q->supports_sync_stream = asic->hw_queues_props[i].supports_sync_stream; q->collective_mode = asic->hw_queues_props[i].collective_mode; + q->dram_bd = asic->hw_queues_props[i].dram_bd; + rc = queue_init(hdev, q, i); if (rc) { dev_err(hdev->dev, "failed to initialize queue %d\n", i); goto release_queues; } + + /* Set DRAM PQ address for the queue if it should be at DRAM */ + if (q->dram_bd) + q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address; } return 0; diff --git a/drivers/accel/habanalabs/common/mmu/Makefile b/drivers/accel/habanalabs/common/mmu/Makefile index 1806c524e04aca..f4b815bf4f7d63 100644 --- a/drivers/accel/habanalabs/common/mmu/Makefile +++ b/drivers/accel/habanalabs/common/mmu/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \ - common/mmu/mmu_v2_hr.o + common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index b654302a68fc08..fa7919dba783c4 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -585,6 +585,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, int hl_mmu_if_set_funcs(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; + if (hdev->mmu_disable) return 0; @@ -597,8 +599,9 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) case ASIC_GAUDI2: case ASIC_GAUDI2B: case ASIC_GAUDI2C: - /* MMUs in Gaudi2 are always host resident */ - hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); + hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); + if (prop->pmmu.host_resident) + hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", @@ -1209,3 +1212,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ return 0; } +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, + (unsigned long) hop_addr) + if (hop_addr == pgt_info->shadow_addr) + break; + + return pgt_info; +} + +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + + hl_mmu_dr_free_pgt_node(ctx, pgt_info); +} + +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info) +{ + struct hl_device *hdev = ctx->hdev; + + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, + hdev->asic_prop.mmu_hop_table_size); + hash_del(&pgt_info->node); + kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); + kfree(pgt_info); +} + +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx) +{ + return ctx->hdev->asic_prop.mmu_pgt_addr + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx) +{ + return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) +{ + u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1; + u64 shadow_hop_addr = shadow_addr & (~page_mask); + u64 pte_offset = shadow_addr & page_mask; + u64 phys_hop_addr; + + if (shadow_hop_addr != hl_mmu_dr_get_hop0_addr(ctx)) + phys_hop_addr = hl_mmu_dr_get_pgt_info(ctx, shadow_hop_addr)->phys_addr; + else + phys_hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + + return phys_hop_addr + pte_offset; +} + +void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + u64 phys_val = hl_mmu_dr_get_phys_addr(ctx, val); + + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), + phys_val); + + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), val); + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr) +{ + hl_mmu_dr_write_final_pte(ctx, pte_addr, 0); +} + +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + hl_mmu_dr_get_pgt_info(ctx, hop_addr)->num_of_ptes++; +} + +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because hl_mmu_free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + + return num_of_ptes_left; +} + +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pgt_info *pgt_info; + u64 phys_addr, shadow_addr; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return ULLONG_MAX; + + phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, + prop->mmu_hop_table_size); + if (!phys_addr) { + dev_err(hdev->dev, "failed to allocate page\n"); + goto pool_add_err; + } + + shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, + GFP_KERNEL); + if (!shadow_addr) + goto shadow_err; + + pgt_info->phys_addr = phys_addr; + pgt_info->shadow_addr = shadow_addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); + + return shadow_addr; + +shadow_err: + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, + phys_addr, prop->mmu_hop_table_size); +pool_add_err: + kfree(pgt_info); + + return ULLONG_MAX; +} + +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop) +{ + u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (hop_addr == ULLONG_MAX) { + hop_addr = hl_mmu_dr_alloc_hop(ctx); + *is_new_hop = (hop_addr != ULLONG_MAX); + } + + return hop_addr; +} + +void hl_mmu_dr_flush(struct hl_ctx *ctx) +{ + /* flush all writes from all cores to reach PCI */ + mb(); + ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx)); +} + +int hl_mmu_dr_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + hdev->mmu_priv.dr.mmu_pgt_pool = + gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); + + if (!hdev->mmu_priv.dr.mmu_pgt_pool) { + dev_err(hdev->dev, "Failed to create page gen pool\n"); + return -ENOMEM; + } + + rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + + prop->mmu_hop0_tables_total_size, + prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size, + -1); + if (rc) { + dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); + goto err_pool_add; + } + + hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, + prop->mmu_hop_table_size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { + rc = -ENOMEM; + goto err_pool_add; + } + + /* MMU H/W init will be done in device hw_init() */ + + return 0; + +err_pool_add: + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + return rc; +} + +void hl_mmu_dr_fini(struct hl_device *hdev) +{ + /* MMU H/W fini was already done in device hw_fini() */ + + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) + return; + + kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + /* Make sure that if we arrive here again without init was + * called we won't cause kernel panic. This can happen for + * example if we fail during hard reset code at certain points + */ + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; +} diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c index d925dc4dd09725..64b5c8fbb166d9 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c @@ -12,166 +12,6 @@ #define MMU_V1_MAX_HOPS (MMU_HOP4 + 1) -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); - -static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = NULL; - - hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, - (unsigned long) hop_addr) - if (hop_addr == pgt_info->shadow_addr) - break; - - return pgt_info; -} - -static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info) -{ - struct hl_device *hdev = ctx->hdev; - - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, - hdev->asic_prop.mmu_hop_table_size); - hash_del(&pgt_info->node); - kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); - kfree(pgt_info); -} - -static void free_hop(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - - _free_hop(ctx, pgt_info); -} - -static u64 alloc_hop(struct hl_ctx *ctx) -{ - struct hl_device *hdev = ctx->hdev; - struct asic_fixed_properties *prop = &hdev->asic_prop; - struct pgt_info *pgt_info; - u64 phys_addr, shadow_addr; - - pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); - if (!pgt_info) - return ULLONG_MAX; - - phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, - prop->mmu_hop_table_size); - if (!phys_addr) { - dev_err(hdev->dev, "failed to allocate page\n"); - goto pool_add_err; - } - - shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, - GFP_KERNEL); - if (!shadow_addr) - goto shadow_err; - - pgt_info->phys_addr = phys_addr; - pgt_info->shadow_addr = shadow_addr; - pgt_info->ctx = ctx; - pgt_info->num_of_ptes = 0; - hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); - - return shadow_addr; - -shadow_err: - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr, - prop->mmu_hop_table_size); -pool_add_err: - kfree(pgt_info); - - return ULLONG_MAX; -} - -static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx) -{ - return ctx->hdev->asic_prop.mmu_pgt_addr + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static inline u64 get_hop0_addr(struct hl_ctx *ctx) -{ - return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static void flush(struct hl_ctx *ctx) -{ - /* flush all writes from all cores to reach PCI */ - mb(); - ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx)); -} - -/* transform the value to physical address when writing to H/W */ -static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) -{ - /* - * The value to write is actually the address of the next shadow hop + - * flags at the 12 LSBs. - * Hence in order to get the value to write to the physical PTE, we - * clear the 12 LSBs and translate the shadow hop to its associated - * physical hop, and add back the original 12 LSBs. - */ - u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) | - (val & FLAGS_MASK); - - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - phys_val); - - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* do not transform the value to physical address when writing to H/W */ -static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, - u64 val) -{ - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - val); - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* clear the last and present bits */ -static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr) -{ - /* no need to transform the value to physical address */ - write_final_pte(ctx, pte_addr, 0); -} - -static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - get_pgt_info(ctx, hop_addr)->num_of_ptes++; -} - -/* - * put_pte - decrement the num of ptes and free the hop if possible - * - * @ctx: pointer to the context structure - * @hop_addr: addr of the hop - * - * This function returns the number of ptes left on this hop. If the number is - * 0, it means the pte was freed. - */ -static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - int num_of_ptes_left; - - pgt_info->num_of_ptes--; - - /* - * Need to save the number of ptes left because free_hop might free - * the pgt_info - */ - num_of_ptes_left = pgt_info->num_of_ptes; - if (!num_of_ptes_left) - _free_hop(ctx, pgt_info); - - return num_of_ptes_left; -} - static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx) { @@ -183,35 +23,6 @@ static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift); } -static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, - bool *is_new_hop) -{ - u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); - - if (hop_addr == ULLONG_MAX) { - hop_addr = alloc_hop(ctx); - *is_new_hop = (hop_addr != ULLONG_MAX); - } - - return hop_addr; -} - -/* translates shadow address inside hop to a physical address */ -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) -{ - u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1); - u64 shadow_hop_addr = shadow_addr & ~page_mask; - u64 pte_offset = shadow_addr & page_mask; - u64 phys_hop_addr; - - if (shadow_hop_addr != get_hop0_addr(ctx)) - phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr; - else - phys_hop_addr = get_phys_hop0_addr(ctx); - - return phys_hop_addr + pte_offset; -} - static int dram_default_mapping_init(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; @@ -236,9 +47,9 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) if (!ctx->dram_default_hops) return -ENOMEM; - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); - hop1_addr = alloc_hop(ctx); + hop1_addr = hl_mmu_dr_alloc_hop(ctx); if (hop1_addr == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 1\n"); rc = -ENOMEM; @@ -247,7 +58,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 1] = hop1_addr; - hop2_addr = alloc_hop(ctx); + hop2_addr = hl_mmu_dr_alloc_hop(ctx); if (hop2_addr == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 2\n"); rc = -ENOMEM; @@ -257,7 +68,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 2] = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - ctx->dram_default_hops[i] = alloc_hop(ctx); + ctx->dram_default_hops[i] = hl_mmu_dr_alloc_hop(ctx); if (ctx->dram_default_hops[i] == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i); rc = -ENOMEM; @@ -268,18 +79,18 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* need only pte 0 in hops 0 and 1 */ pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop0_addr, pte_val); + hl_mmu_dr_write_pte(ctx, hop0_addr, pte_val); pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop1_addr, pte_val); - get_pte(ctx, hop1_addr); + hl_mmu_dr_write_pte(ctx, hop1_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop1_addr); hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop2_pte_addr, pte_val); - get_pte(ctx, hop2_addr); + hl_mmu_dr_write_pte(ctx, hop2_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } @@ -289,23 +100,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - write_final_pte(ctx, hop3_pte_addr, pte_val); - get_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_write_final_pte(ctx, hop3_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } - flush(ctx); + hl_mmu_dr_flush(ctx); return 0; hop3_err: for (i = 0 ; i < hop3_allocated ; i++) - free_hop(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_free_hop(ctx, ctx->dram_default_hops[i]); - free_hop(ctx, hop2_addr); + hl_mmu_dr_free_hop(ctx, hop2_addr); hop2_err: - free_hop(ctx, hop1_addr); + hl_mmu_dr_free_hop(ctx, hop1_addr); hop1_err: kfree(ctx->dram_default_hops); @@ -329,7 +140,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) do_div(num_of_hop3, prop->dram_page_size); do_div(num_of_hop3, HOP_PTE_ENTRIES_512); - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; hop1_addr = ctx->dram_default_hops[total_hops - 1]; @@ -338,101 +149,26 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - clear_pte(ctx, hop3_pte_addr); - put_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_clear_pte(ctx, hop3_pte_addr); + hl_mmu_dr_put_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - clear_pte(ctx, hop2_pte_addr); - put_pte(ctx, hop2_addr); + hl_mmu_dr_clear_pte(ctx, hop2_pte_addr); + hl_mmu_dr_put_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } - clear_pte(ctx, hop1_addr); - put_pte(ctx, hop1_addr); - clear_pte(ctx, hop0_addr); + hl_mmu_dr_clear_pte(ctx, hop1_addr); + hl_mmu_dr_put_pte(ctx, hop1_addr); + hl_mmu_dr_clear_pte(ctx, hop0_addr); kfree(ctx->dram_default_hops); - flush(ctx); -} - -/** - * hl_mmu_v1_init() - initialize the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Create a pool of pages for pgt_infos. - * - Create a shadow table for pgt - * - * Return: 0 for success, non-zero for failure. - */ -static int hl_mmu_v1_init(struct hl_device *hdev) -{ - struct asic_fixed_properties *prop = &hdev->asic_prop; - int rc; - - hdev->mmu_priv.dr.mmu_pgt_pool = - gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); - - if (!hdev->mmu_priv.dr.mmu_pgt_pool) { - dev_err(hdev->dev, "Failed to create page gen pool\n"); - return -ENOMEM; - } - - rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + - prop->mmu_hop0_tables_total_size, - prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size, - -1); - if (rc) { - dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); - goto err_pool_add; - } - - hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size, - GFP_KERNEL); - if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - rc = -ENOMEM; - goto err_pool_add; - } - - /* MMU H/W init will be done in device hw_init() */ - - return 0; - -err_pool_add: - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - return rc; -} - -/** - * hl_mmu_v1_fini() - release the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Disable MMU in H/W. - * - Free the pgt_infos pool. - * - * All contexts should be freed before calling this function. - */ -static void hl_mmu_v1_fini(struct hl_device *hdev) -{ - /* MMU H/W fini was already done in device hw_fini() */ - - if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - /* Make sure that if we arrive here again without init was - * called we won't cause kernel panic. This can happen for - * example if we fail during hard reset code at certain points - */ - hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; - } + hl_mmu_dr_flush(ctx); } /** @@ -476,7 +212,7 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx) dev_err_ratelimited(hdev->dev, "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); - _free_hop(ctx, pgt_info); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); } } @@ -495,7 +231,7 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte); if (hop_addr[hop_idx] == ULLONG_MAX) @@ -546,30 +282,30 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, } hop_idx = MMU_HOP3; - write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); - put_pte(ctx, hop_addr[hop_idx]); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); + hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx]); } else { if (!(curr_pte & PAGE_PRESENT_MASK)) goto not_mapped; if (hop_addr[MMU_HOP4]) - clear_pte(ctx, hop_pte_addr[MMU_HOP4]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP4]); else - clear_pte(ctx, hop_pte_addr[MMU_HOP3]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP3]); - if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4])) + if (hop_addr[MMU_HOP4] && !hl_mmu_dr_put_pte(ctx, hop_addr[MMU_HOP4])) clear_hop3 = true; if (!clear_hop3) goto mapped; for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) { - clear_pte(ctx, hop_pte_addr[hop_idx]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[hop_idx]); if (hop_idx == MMU_HOP0) break; - if (put_pte(ctx, hop_addr[hop_idx])) + if (hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx])) goto mapped; } } @@ -616,10 +352,10 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = - get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); + hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); if (hop_addr[hop_idx] == ULLONG_MAX) goto err; } @@ -666,27 +402,27 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; - write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) { prev_hop = hop_idx - 1; if (hop_new[hop_idx]) { curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); + hl_mmu_dr_write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); if (hop_idx != MMU_HOP1) - get_pte(ctx, hop_addr[prev_hop]); + hl_mmu_dr_get_pte(ctx, hop_addr[prev_hop]); } } - get_pte(ctx, hop_addr[num_hops - 1]); + hl_mmu_dr_get_pte(ctx, hop_addr[num_hops - 1]); return 0; err: for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) { if (hop_new[hop_idx]) - free_hop(ctx, hop_addr[hop_idx]); + hl_mmu_dr_free_hop(ctx, hop_addr[hop_idx]); } return rc; @@ -752,7 +488,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, if (is_huge) used_hops--; - hops->hop_info[0].hop_addr = get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, hops->hop_info[0].hop_addr, virt_addr); @@ -801,13 +537,13 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, */ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) { - mmu->init = hl_mmu_v1_init; - mmu->fini = hl_mmu_v1_fini; + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; mmu->ctx_init = hl_mmu_v1_ctx_init; mmu->ctx_fini = hl_mmu_v1_ctx_fini; mmu->map = hl_mmu_v1_map; mmu->unmap = hl_mmu_v1_unmap; - mmu->flush = flush; + mmu->flush = hl_mmu_dr_flush; mmu->swap_out = hl_mmu_v1_swap_out; mmu->swap_in = hl_mmu_v1_swap_in; mmu->get_tlb_info = hl_mmu_v1_get_tlb_info; diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2.c b/drivers/accel/habanalabs/common/mmu/mmu_v2.c new file mode 100644 index 00000000000000..4bc0268fff1cf0 --- /dev/null +++ b/drivers/accel/habanalabs/common/mmu/mmu_v2.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2020 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/mmu/mmu_general.h" +#include "../../include/hw_ip/mmu/mmu_v2_0.h" + +#include + +/** + * hl_mmu_v2_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all + * page tables hops related to this context. + * Return: 0 on success, non-zero otherwise. + */ +static int hl_mmu_v2_ctx_init(struct hl_ctx *ctx) +{ + hash_init(ctx->mmu_shadow_hash); + + return 0; +} + +/* + * hl_mmu_v2_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + * - Free DRAM default page mapping hops + */ +static void hl_mmu_v2_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + if (!hash_empty(ctx->mmu_shadow_hash)) + dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n", + ctx->asid); + + hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) { + dev_err_ratelimited(hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + } +} + +static int hl_mmu_v2_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, curr_pte, + scrambled_virt_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_huge = false; + int i, hop_last; + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + if (hop_pte_addr[0] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hop_addr[i] = hl_mmu_get_next_hop_addr(ctx, curr_pte); + if (hop_addr[i] == ULLONG_MAX) + goto not_mapped; + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + + if ((i <= hop_last) && (curr_pte & mmu_prop->last_mask)) { + hop_last = i; + is_huge = true; + break; + } + } + + if (is_dram_addr && !is_huge) { + dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n"); + return -EFAULT; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + for (i = hop_last ; i > 0 ; i--) { + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[i]); + if (hl_mmu_dr_put_pte(ctx, hop_addr[i])) + goto mapped; + } + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[0]); + +mapped: + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + + return -EINVAL; +} + +static int hl_mmu_v2_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, + curr_pte = 0, scrambled_virt_addr, scrambled_phys_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + bool hop_new[MMU_ARCH_6_HOPS] = { false }; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + int rc, i, hop_last; + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + scrambled_phys_addr = hdev->asic_funcs->scramble_addr(hdev, phys_addr); + + /* First hop is preallocated therefore it is treated differently */ + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + /* Handle hop1 to hop_last */ + for (i = 1 ; i <= hop_last ; i++) { + hop_addr[i] = hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[i]); + if (hop_addr[i] == ULLONG_MAX) { + rc = -ENOMEM; + goto err; + } + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) { + rc = -EINVAL; + goto err; + } + + if (!hop_pte_addr[i]) { + rc = -EINVAL; + goto err; + } + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + } + + if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, + "mapping already exists for virt_addr 0x%llx\n", + virt_addr); + + for (i = 0 ; i <= hop_last ; i++) + dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", + i, *(u64 *) (uintptr_t) hop_pte_addr[i], + hop_pte_addr[i]); + + rc = -EINVAL; + goto err; + } + + curr_pte = (scrambled_phys_addr & HOP_PHYS_ADDR_MASK) + | mmu_prop->last_mask | PAGE_PRESENT_MASK; + + /* Write the PTEs */ + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_last], curr_pte); + + /* for each new hop, add its address to the table of previous-hop */ + for (i = 1 ; i <= hop_last ; i++) { + if (hop_new[i]) { + curr_pte = (hop_addr[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + hl_mmu_dr_write_pte(ctx, hop_pte_addr[i - 1], curr_pte); + + if (i - 1) + hl_mmu_dr_get_pte(ctx, hop_addr[i - 1]); + } + } + hl_mmu_dr_get_pte(ctx, hop_addr[hop_last]); + + return 0; + +err: + for (i = 1 ; i <= hop_last ; i++) + if (hop_new[i] && (hop_addr[i] != U64_MAX)) + hl_mmu_dr_free_hop(ctx, hop_addr[i]); + + return rc; +} + +/* + * hl_mmu_v2_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_v2_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_in(struct hl_ctx *ctx) +{ + +} + +static int hl_mmu_v2_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_dram_addr; + int i; + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + hops->range_type = HL_VA_RANGE_TYPE_DRAM; + + hops->scrambled_vaddr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hops->hop_info[0].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[0].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[0].hop_pte_val = hdev->asic_funcs->read_pte(hdev, + hops->hop_info[0].hop_pte_addr); + if (hops->hop_info[0].hop_pte_val == U64_MAX) + return -EFAULT; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hops->hop_info[i].hop_addr = + hl_mmu_get_next_hop_addr(ctx, hops->hop_info[i - 1].hop_pte_val); + if (hops->hop_info[i].hop_addr == ULLONG_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops->hop_info[i].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[i].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_val = + hdev->asic_funcs->read_pte(hdev, + hops->hop_info[i].hop_pte_addr); + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) + break; + } + + /* if passed over all hops then no last hop was found */ + if (i == mmu_prop->num_hops) + return -EFAULT; + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->scrambled_vaddr != virt_addr) + hops->unscrambled_paddr = hdev->asic_funcs->descramble_addr + (hdev, hops->hop_info[i].hop_pte_val); + else + hops->unscrambled_paddr = hops->hop_info[i].hop_pte_val; + + hops->used_hops = i + 1; + + return 0; +} + +/* + * hl_mmu_v2_prepare - prepare mmu_if for working with mmu v2 + * + * @hdev: pointer to the device structure + * @mmu_if: pointer to the mmu interface structure + */ +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) +{ + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; + mmu->ctx_init = hl_mmu_v2_ctx_init; + mmu->ctx_fini = hl_mmu_v2_ctx_fini; + mmu->map = hl_mmu_v2_map; + mmu->unmap = hl_mmu_v2_unmap; + mmu->flush = hl_mmu_dr_flush; + mmu->swap_out = hl_mmu_v2_swap_out; + mmu->swap_in = hl_mmu_v2_swap_in; + mmu->get_tlb_info = hl_mmu_v2_get_tlb_info; +} diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 53292d4c15c865..dde3839fe0e070 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -649,6 +649,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2); prop->dmmu.end_addr = VA_HOST_SPACE_END; prop->dmmu.page_size = PAGE_SIZE_2MB; + prop->dmmu.pgt_size = prop->mmu_pgt_size; prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index fd01525b1ea204..5863c904913433 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2308,11 +2308,26 @@ static int set_number_of_functional_hbms(struct hl_device *hdev) return 0; } +static bool gaudi2_is_edma_queue_id(u32 queue_id) +{ + + switch (queue_id) { + case GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3: + return true; + default: + return false; + } +} + static int gaudi2_set_dram_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 basic_hbm_page_size; - int rc; + u64 hbm_drv_base_offset = 0, edma_pq_base_addr; + u32 basic_hbm_page_size, edma_idx = 0; + int rc, i; rc = set_number_of_functional_hbms(hdev); if (rc) @@ -2356,9 +2371,35 @@ static int gaudi2_set_dram_properties(struct hl_device *hdev) prop->dmmu.start_addr = prop->dram_base_address + (prop->dram_page_size * DIV_ROUND_UP_SECTOR_T(prop->dram_size, prop->dram_page_size)); - prop->dmmu.end_addr = prop->dmmu.start_addr + prop->dram_page_size * div_u64((VA_HBM_SPACE_END - prop->dmmu.start_addr), prop->dmmu.page_size); + /* + * Driver can't share an (48MB) HBM page with the F/W in order to prevent FW to block + * the driver part by range register, so it must start at the next (48MB) page + */ + hbm_drv_base_offset = roundup(CPU_FW_IMAGE_SIZE, prop->num_functional_hbms * SZ_8M); + + /* + * The NIC driver section size and the HMMU page tables section in the HBM needs + * to be the remaining size in the first dram page after taking into + * account the F/W image size + */ + + /* Reserve region in HBM for HMMU page tables */ + prop->mmu_pgt_addr = DRAM_PHYS_BASE + hbm_drv_base_offset + + ((prop->dram_page_size - hbm_drv_base_offset) - + (HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE + EDMA_SCRATCHPAD_SIZE)); + + /* Set EDMA PQs HBM addresses */ + edma_pq_base_addr = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE; + + for (i = 0 ; i < GAUDI2_QUEUE_ID_CPU_PQ ; i++) { + if (gaudi2_is_edma_queue_id(i)) { + prop->hw_queues_props[i].q_dram_bd_address = edma_pq_base_addr + + (edma_idx * HL_QUEUE_SIZE_IN_BYTES); + edma_idx++; + } + } return 0; } @@ -2368,7 +2409,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; struct hw_queue_properties *q_props; u32 num_sync_stream_queues = 0; - int i; + int i, rc; prop->max_queues = GAUDI2_QUEUE_ID_SIZE; prop->hw_queues_props = kcalloc(prop->max_queues, sizeof(struct hw_queue_properties), @@ -2391,6 +2432,9 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) } q_props[i].cb_alloc_flags = CB_ALLOC_USER; + + if (gaudi2_is_edma_queue_id(i)) + q_props[i].dram_bd = 1; } q_props[GAUDI2_QUEUE_ID_CPU_PQ].type = QUEUE_TYPE_CPU; @@ -2419,40 +2463,39 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->rotator_enabled_mask = BIT(NUM_OF_ROT) - 1; - if (hdev->pldm) - prop->mmu_pgt_size = 0x800000; /* 8MB */ - else - prop->mmu_pgt_size = MMU_PAGE_TABLES_INITIAL_SIZE; + prop->max_asid = 2; + prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; + prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT; prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT; prop->dmmu.hop_shifts[MMU_HOP2] = DHOP2_SHIFT; prop->dmmu.hop_shifts[MMU_HOP3] = DHOP3_SHIFT; - prop->dmmu.hop_shifts[MMU_HOP4] = DHOP4_SHIFT; prop->dmmu.hop_masks[MMU_HOP0] = DHOP0_MASK; prop->dmmu.hop_masks[MMU_HOP1] = DHOP1_MASK; prop->dmmu.hop_masks[MMU_HOP2] = DHOP2_MASK; prop->dmmu.hop_masks[MMU_HOP3] = DHOP3_MASK; - prop->dmmu.hop_masks[MMU_HOP4] = DHOP4_MASK; prop->dmmu.page_size = PAGE_SIZE_1GB; - prop->dmmu.num_hops = MMU_ARCH_6_HOPS; + prop->dmmu.num_hops = MMU_ARCH_4_HOPS; prop->dmmu.last_mask = LAST_MASK; - prop->dmmu.host_resident = 1; + prop->dmmu.host_resident = 0; prop->dmmu.hop_table_size = prop->mmu_hop_table_size; prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; - /* - * this is done in order to be able to validate FW descriptor (i.e. validating that - * the addresses and allocated space for FW image does not cross memory bounds). - * for this reason we set the DRAM size to the minimum possible and later it will - * be modified according to what reported in the cpucp info packet + /* As we need to set the pgt address in dram for HMMU init so we cannot + * wait to the fw cpucp info to set the dram props as mmu init comes before + * hw init */ - prop->dram_size = (GAUDI2_HBM_NUM - 1) * SZ_16G; + rc = hdev->asic_funcs->set_dram_properties(hdev); + if (rc) + goto free_qprops; + + prop->mmu_pgt_size = PMMU_PAGE_TABLES_SIZE; + prop->pmmu.pgt_size = prop->mmu_pgt_size; hdev->pmmu_huge_range = true; prop->pmmu.host_resident = 1; prop->pmmu.num_hops = MMU_ARCH_6_HOPS; @@ -2516,7 +2559,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE; prop->num_engine_cores = CPU_ID_MAX; prop->cfg_size = CFG_SIZE; - prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI2_EVENT_SIZE; prop->supports_engine_modes = true; @@ -2560,6 +2602,10 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0; return 0; + +free_qprops: + kfree(prop->hw_queues_props); + return rc; } static int gaudi2_pci_bars_map(struct hl_device *hdev) @@ -3033,6 +3079,25 @@ static int gaudi2_fetch_psoc_frequency(struct hl_device *hdev) return 0; } +static int gaudi2_mmu_clear_pgt_range(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!(gaudi2->hw_cap_initialized & HW_CAP_MMU_MASK)) + return 0; + + if (prop->dmmu.host_resident) + return 0; + + rc = gaudi2_memset_device_memory(hdev, prop->mmu_pgt_addr, prop->dmmu.pgt_size, 0); + if (rc) + dev_err(hdev->dev, "Failed to clear mmu pgt"); + + return rc; +} + static int gaudi2_early_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -3258,6 +3323,12 @@ static int gaudi2_late_init(struct hl_device *hdev) goto disable_pci_access; } + rc = gaudi2_mmu_clear_pgt_range(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to clear MMU page tables range\n"); + goto disable_pci_access; + } + gaudi2_init_arcs(hdev); rc = gaudi2_scrub_arcs_dccm(hdev); @@ -3697,13 +3768,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) spin_lock_init(&gaudi2->hw_queues_lock); - gaudi2->scratchpad_kernel_address = hl_asic_dma_alloc_coherent(hdev, PAGE_SIZE, - &gaudi2->scratchpad_bus_address, - GFP_KERNEL | __GFP_ZERO); - if (!gaudi2->scratchpad_kernel_address) { - rc = -ENOMEM; - goto free_virt_msix_db_mem; - } + gaudi2->scratchpad_bus_address = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE; gaudi2_user_mapped_blocks_init(hdev); @@ -3727,7 +3792,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) rc = gaudi2_special_blocks_iterator_config(hdev); if (rc) - goto free_scratchpad_mem; + goto free_virt_msix_db_mem; rc = gaudi2_test_queues_msgs_alloc(hdev); if (rc) @@ -3737,9 +3802,6 @@ static int gaudi2_sw_init(struct hl_device *hdev) special_blocks_free: gaudi2_special_blocks_iterator_free(hdev); -free_scratchpad_mem: - hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); free_virt_msix_db_mem: hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); free_cpu_accessible_dma_pool: @@ -3770,9 +3832,6 @@ static int gaudi2_sw_fini(struct hl_device *hdev) hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, hdev->cpu_accessible_dma_address); - hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); - dma_pool_destroy(hdev->dma_pool); kfree(gaudi2); @@ -4962,10 +5021,17 @@ static void gaudi2_init_qman_pq(struct hl_device *hdev, u32 reg_base, q = &hdev->kernel_queues[queue_id_base + pq_id]; pq_offset = pq_id * 4; - WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, - lower_32_bits(q->bus_address)); - WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, - upper_32_bits(q->bus_address)); + if (q->dram_bd) { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->pq_dram_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->pq_dram_address)); + } else { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->bus_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->bus_address)); + } WREG32(reg_base + QM_PQ_SIZE_0_OFFSET + pq_offset, ilog2(HL_QUEUE_LENGTH)); WREG32(reg_base + QM_PQ_PI_0_OFFSET + pq_offset, 0); WREG32(reg_base + QM_PQ_CI_0_OFFSET + pq_offset, 0); @@ -5852,7 +5918,8 @@ static int gaudi2_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_har return rc; } -static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) +static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base, + bool host_resident_pgt) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 hop0_addr; @@ -5864,7 +5931,11 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) max_asid = min((u32) 8, max_asid); for (asid = 0 ; asid < max_asid ; asid++) { - hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + if (host_resident_pgt) + hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + else + hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size); + rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr); if (rc) { dev_err(hdev->dev, "failed to set hop0 addr for asid %d\n", asid); @@ -5875,7 +5946,8 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) return 0; } -static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base) +static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base, + bool host_resident_pgt) { u32 status, timeout_usec; int rc; @@ -5898,7 +5970,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb if (rc) dev_notice_ratelimited(hdev->dev, "Timeout when waiting for MMU SRAM init\n"); - rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base); + rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base, host_resident_pgt); if (rc) return rc; @@ -5922,6 +5994,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb static int gaudi2_pci_mmu_init(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; struct gaudi2_device *gaudi2 = hdev->asic_specific; u32 mmu_base, stlb_base; int rc; @@ -5961,7 +6034,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_PMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->pmmu.host_resident); if (rc) return rc; @@ -6013,7 +6086,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id, WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_HMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->dmmu.host_resident); if (rc) return rc; @@ -7051,7 +7124,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* send test message on all enabled Qs */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; @@ -7068,7 +7141,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* verify that all messages were processed */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); @@ -8988,7 +9061,6 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool if (is_pmmu) { dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr); } else { - addr = gaudi2_mmu_descramble_addr(hdev, addr); addr &= HW_UNSCRAMBLED_BITS_MASK; dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n", @@ -10255,11 +10327,11 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent } static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, - struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr, - u32 hw_queue_id, u32 size, u64 addr, u32 val) + struct packet_lin_dma *lin_dma_pkt, + u64 phys_addr, u32 hw_queue_id, u32 size, u64 addr, u32 val) { u32 ctl, pkt_size; - int rc = 0; + int rc = 0, i; ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); @@ -10273,7 +10345,12 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, pkt_size = sizeof(struct packet_lin_dma); - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); + for (i = 0; i < 3; i++) + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + phys_addr + (i * sizeof(u64)), + ((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr); if (rc) dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", hw_queue_id); @@ -10288,12 +10365,11 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0, GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0}; u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val, - old_mmubp, mmubp, num_of_pkts, busy, pkt_size; + old_mmubp, mmubp, num_of_pkts, busy, pkt_size, cb_len; u64 comp_addr, cur_addr = addr, end_addr = addr + size; struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc = 0, dma_num = 0, i; void *lin_dma_pkts_arr; - dma_addr_t pkt_dma_addr; - int rc = 0, dma_num = 0; if (prop->edma_enabled_mask == 0) { dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); @@ -10311,9 +10387,19 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz /* Calculate how many lin dma pkts we'll need */ num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G); pkt_size = sizeof(struct packet_lin_dma); + cb_len = pkt_size * num_of_pkts; + + /* + * if we're not scrubing HMMU or NIC reserved sections in hbm, + * then it the scrubing of the user section, as we use the start of the user section + * to store the CB of the EDMA QM, so shift the start address of the scrubbing accordingly + * and scrub the CB section before leaving this function. + */ + if ((addr >= prop->dram_user_base_address) && + (addr < prop->dram_user_base_address + cb_len)) + cur_addr += (prop->dram_user_base_address + cb_len) - addr; - lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts, - &pkt_dma_addr, GFP_KERNEL); + lin_dma_pkts_arr = kvcalloc(num_of_pkts, pkt_size, GFP_KERNEL); if (!lin_dma_pkts_arr) return -ENOMEM; @@ -10359,7 +10445,7 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev, (struct packet_lin_dma *)lin_dma_pkts_arr + dma_num, - pkt_dma_addr + dma_num * pkt_size, + prop->dram_user_base_address + (dma_num * pkt_size), edma_queues_id[dcore] + edma_idx * 4, chunk_size, cur_addr, val); if (rc) @@ -10368,14 +10454,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz dma_num++; cur_addr += chunk_size; if (cur_addr == end_addr) - break; + goto edma_wait; } } } +edma_wait: rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); if (rc) { - dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); + dev_err(hdev->dev, "DMA Timeout during HBM scrubbing(sob: 0x%x, dma_num: 0x%x)\n", + busy, dma_num); goto end; } end: @@ -10396,8 +10484,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz } } + memset(lin_dma_pkts_arr, 0, sizeof(u64)); + + /* Zero the HBM area where we copied the CB */ + for (i = 0; i < cb_len / sizeof(u64); i += sizeof(u64)) + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + prop->dram_user_base_address + i, + (u64 *)(lin_dma_pkts_arr), DEBUGFS_WRITE64); WREG32(sob_addr, 0); - hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr); + + kfree(lin_dma_pkts_arr); return rc; } @@ -11455,7 +11551,7 @@ static int gaudi2_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_p return 0; page_size_err: - dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n", + dev_err(hdev->dev, "page size of 0x%X is not 0x%X aligned, can't map\n", page_size, mmu_prop->page_size >> 10); return -EFAULT; } @@ -11475,6 +11571,29 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open) return hl_fw_send_device_activity(hdev, open); } +static u64 gaudi2_read_pte(struct hl_device *hdev, u64 addr) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u64 val; + + if (hdev->reset_info.hard_reset_pending) + return U64_MAX; + + val = readq(hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); + + return val; +} + +static void gaudi2_write_pte(struct hl_device *hdev, u64 addr, u64 val) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + + if (hdev->reset_info.hard_reset_pending) + return; + + writeq(val, hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); +} + static const struct hl_asic_funcs gaudi2_funcs = { .early_init = gaudi2_early_init, .early_fini = gaudi2_early_fini, @@ -11511,8 +11630,8 @@ static const struct hl_asic_funcs gaudi2_funcs = { .add_device_attr = gaudi2_add_device_attr, .handle_eqe = gaudi2_handle_eqe, .get_events_stat = gaudi2_get_events_stat, - .read_pte = NULL, - .write_pte = NULL, + .read_pte = gaudi2_read_pte, + .write_pte = gaudi2_write_pte, .mmu_invalidate_cache = gaudi2_mmu_invalidate_cache, .mmu_invalidate_cache_range = gaudi2_mmu_invalidate_cache_range, .mmu_prefetch_cache_range = NULL, diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 9b9eef0d97d6e8..bc508c9cee5c50 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -19,8 +19,6 @@ #define GAUDI2_LINUX_FW_FILE "habanalabs/gaudi2/gaudi2-fit.itb" #define GAUDI2_BOOT_FIT_FILE "habanalabs/gaudi2/gaudi2-boot-fit.itb" -#define MMU_PAGE_TABLES_INITIAL_SIZE 0x10000000 /* 256MB */ - #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ #define NUMBER_OF_PDMA_QUEUES 2 @@ -109,13 +107,11 @@ /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ - -/* This define should be used only when working in a debug mode without dram. - * When working with dram, the driver size will be calculated dynamically. - */ -#define NIC_DEFAULT_DRV_SIZE 0x20000000 /* 512MB */ - #define CPU_FW_IMAGE_ADDR DRAM_PHYS_BASE +#define PMMU_PAGE_TABLES_SIZE 0x10000000 /* 256MB */ +#define EDMA_PQS_SIZE SZ_2M +#define EDMA_SCRATCHPAD_SIZE SZ_1M +#define HMMU_PAGE_TABLES_SIZE SZ_1M #define NIC_NUMBER_OF_PORTS NIC_NUMBER_OF_ENGINES diff --git a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h index d408feecd4834d..b4a5e95be35421 100644 --- a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h @@ -26,6 +26,8 @@ #define LAST_MASK 0x0000000000800ull #define FLAGS_MASK 0x0000000000FFFull +#define MMU_ARCH_3_HOPS 3 +#define MMU_ARCH_4_HOPS 4 #define MMU_ARCH_5_HOPS 5 #define MMU_ARCH_6_HOPS 6 From 8aba7a26817a7a87b5b852faf93838baaca20b06 Mon Sep 17 00:00:00 2001 From: Malkoot Khan Date: Thu, 28 Dec 2023 21:08:58 +0000 Subject: [PATCH 172/707] accel/habanalabs: Remove unnecessary braces from if statement The coding style in the Linux kernel prefers not to use braces for single-statement if conditions. This patch removes the unnecessary braces from an if statement in the file drivers/accel/habanalabs/common/command_submission.c, which also resolves a coding style warning. Signed-off-by: Malkoot Khan Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/command_submission.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 3aa6eeef443b41..39e23d625a3cbb 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) return -EINVAL; } - if (!hl_device_operational(hdev, &status)) { + if (!hl_device_operational(hdev, &status)) return -EBUSY; - } if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) && !hdev->supports_staged_submission) { From 149972a8b935c9c82cd5517d49c80b3b9029bd28 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 1 Jan 2024 22:37:43 +0200 Subject: [PATCH 173/707] accel/habanalabs: remove call to deprecated function In newer kernel versions, irq_set_affinity_hint() is deprecated. Instead, use the newer version which is irq_set_affinity_and_hint(). Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 2 +- drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index cf004baf5e6213..3b9e8a21d7df8b 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2833,6 +2833,6 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq) return; } - if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask)) + if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask)) dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); } diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 5863c904913433..05e2170c815e6b 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -4395,7 +4395,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev) i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) { irq = pci_irq_vector(hdev->pdev, i); - irq_set_affinity_hint(irq, NULL); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR); @@ -4476,7 +4476,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev) k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) { irq = pci_irq_vector(hdev->pdev, i); - irq_set_affinity_hint(irq, NULL); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } From 41e9304988a1eb5a7b3c02f210b76d50488f0731 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 2 Jan 2024 16:51:09 +0200 Subject: [PATCH 174/707] accel/habanalabs/gaudi2: fail memory memset when failing to copy QM packet to device gaudi2_memset_memory_chunk_using_edma_qm() calls the access_dev_mem() ASIC function, but ignores its return value. Add this missing check. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 05e2170c815e6b..1f061209ae2157 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -10345,14 +10345,20 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, pkt_size = sizeof(struct packet_lin_dma); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, phys_addr + (i * sizeof(u64)), ((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64); + if (rc) { + dev_err(hdev->dev, "Failed to copy lin_dma packet to HBM (%#llx)\n", + phys_addr); + return rc; + } + } rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr); if (rc) - dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", + dev_err(hdev->dev, "Failed to send lin_dma packet to H/W queue %d\n", hw_queue_id); return rc; From fedec9564504be39597a5ac215932e8cc017a364 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 6 Jan 2024 12:42:13 +0000 Subject: [PATCH 175/707] accel/habanalabs/goya: remove redundant assignment to pointer 'input' The pointer input is assigned a value that is not read, it is being re-assigned again later with the same value. Resolve this by moving the declaration to input into the if block. Cleans up clang scan build warning: warning: Value stored to 'input' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/goya/goya_coresight.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c index 41cae5fd843b88..3827ea4c02f740 100644 --- a/drivers/accel/habanalabs/goya/goya_coresight.c +++ b/drivers/accel/habanalabs/goya/goya_coresight.c @@ -576,7 +576,6 @@ static int goya_config_spmu(struct hl_device *hdev, struct hl_debug_params *params) { u64 base_reg; - struct hl_debug_params_spmu *input = params->input; u64 *output; u32 output_arr_len; u32 events_num; @@ -592,7 +591,7 @@ static int goya_config_spmu(struct hl_device *hdev, base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE; if (params->enable) { - input = params->input; + struct hl_debug_params_spmu *input = params->input; if (!input) return -EINVAL; From dddb2e526a3650f3aa19c9ed315ae0f49c768810 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sat, 20 Jan 2024 16:10:28 +0100 Subject: [PATCH 176/707] accel/habanalabs: use kcalloc() instead of kzalloc() As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the purpose specific kcalloc() function instead of the argument size * count in the kzalloc() function. Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/162 Signed-off-by: Erick Archer Reviewed-by: Gustavo A. R. Silva Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/mmu/mmu_v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c index 64b5c8fbb166d9..845d16aaa63741 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c @@ -43,7 +43,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; - ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL); + ctx->dram_default_hops = kcalloc(total_hops, HL_PTE_SIZE, GFP_KERNEL); if (!ctx->dram_default_hops) return -ENOMEM; From 4e05c06df7d321e8fd8ceda2c8d4712f62ffb469 Mon Sep 17 00:00:00 2001 From: Aleksandrs Vinarskis Date: Thu, 21 Dec 2023 19:51:41 +0100 Subject: [PATCH 177/707] mfd: intel-lpss: Switch to generalized quirk table Introduce generic quirk table, and port existing walkaround for select Microsoft devices to it. This is a preparation for QUIRK_CLOCK_DIVIDER_UNITY. Signed-off-by: Aleksandrs Vinarskis Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231221185142.9224-2-alex.vinarskis@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 23 +++++++++++++++-------- drivers/mfd/intel-lpss.c | 2 +- drivers/mfd/intel-lpss.h | 9 ++++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 4621d3950b8f9f..07713a2f694f32 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -23,12 +23,17 @@ #include "intel-lpss.h" -/* Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources */ -static const struct pci_device_id ignore_resource_conflicts_ids[] = { - /* Microsoft Surface Go (version 1) I2C4 */ - { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182), }, - /* Microsoft Surface Go 2 I2C4 */ - { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), }, +static const struct pci_device_id quirk_ids[] = { + { + /* Microsoft Surface Go (version 1) I2C4 */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182), + .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, + }, + { + /* Microsoft Surface Go 2 I2C4 */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), + .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, + }, { } }; @@ -36,6 +41,7 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct intel_lpss_platform_info *data = (void *)id->driver_data; + const struct pci_device_id *quirk_pci_info; struct intel_lpss_platform_info *info; int ret; @@ -55,8 +61,9 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, info->mem = pci_resource_n(pdev, 0); info->irq = pci_irq_vector(pdev, 0); - if (pci_match_id(ignore_resource_conflicts_ids, pdev)) - info->ignore_resource_conflicts = true; + quirk_pci_info = pci_match_id(quirk_ids, pdev); + if (quirk_pci_info) + info->quirks = quirk_pci_info->driver_data; pdev->d3cold_delay = 0; diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index eff423f7dd2847..aafa0da5f8dbfd 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -412,7 +412,7 @@ int intel_lpss_probe(struct device *dev, return ret; lpss->cell->swnode = info->swnode; - lpss->cell->ignore_resource_conflicts = info->ignore_resource_conflicts; + lpss->cell->ignore_resource_conflicts = info->quirks & QUIRK_IGNORE_RESOURCE_CONFLICTS; intel_lpss_init_dev(lpss); diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h index c1d72b117ed5e6..2fa9ef9162580e 100644 --- a/drivers/mfd/intel-lpss.h +++ b/drivers/mfd/intel-lpss.h @@ -11,16 +11,23 @@ #ifndef __MFD_INTEL_LPSS_H #define __MFD_INTEL_LPSS_H +#include #include +/* + * Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources. + * Set to ignore resource conflicts with ACPI declared SystemMemory regions. + */ +#define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0) + struct device; struct resource; struct software_node; struct intel_lpss_platform_info { struct resource *mem; - bool ignore_resource_conflicts; int irq; + unsigned int quirks; unsigned long clk_rate; const char *clk_con_id; const struct software_node *swnode; From b47f1f55e26b98bf6811137735d2d3bc3bc6c3bc Mon Sep 17 00:00:00 2001 From: Aleksandrs Vinarskis Date: Thu, 21 Dec 2023 19:51:42 +0100 Subject: [PATCH 178/707] mfd: intel-lpss: Introduce QUIRK_CLOCK_DIVIDER_UNITY for XPS 9530 Some devices (eg. Dell XPS 9530, 2023) due to a firmware bug have a misconfigured clock divider, which should've been 1:1. This introduces quirk which conditionally re-configures the clock divider to 1:1. Signed-off-by: Aleksandrs Vinarskis Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231221185142.9224-3-alex.vinarskis@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 5 +++++ drivers/mfd/intel-lpss.c | 7 +++++++ drivers/mfd/intel-lpss.h | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 07713a2f694f32..8c00e0c695c5b9 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -34,6 +34,11 @@ static const struct pci_device_id quirk_ids[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), .driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS, }, + { + /* Dell XPS 9530 (2023) */ + PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x51fb, 0x1028, 0x0beb), + .driver_data = QUIRK_CLOCK_DIVIDER_UNITY, + }, { } }; diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index aafa0da5f8dbfd..2a9018112dfc86 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -300,6 +300,7 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss, { char name[32]; struct clk *tmp = *clk; + int ret; snprintf(name, sizeof(name), "%s-enable", devname); tmp = clk_register_gate(NULL, name, __clk_get_name(tmp), 0, @@ -316,6 +317,12 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss, return PTR_ERR(tmp); *clk = tmp; + if (lpss->info->quirks & QUIRK_CLOCK_DIVIDER_UNITY) { + ret = clk_set_rate(tmp, lpss->info->clk_rate); + if (ret) + return ret; + } + snprintf(name, sizeof(name), "%s-update", devname); tmp = clk_register_gate(NULL, name, __clk_get_name(tmp), CLK_SET_RATE_PARENT, lpss->priv, 31, 0, NULL); diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h index 2fa9ef9162580e..6f8f668f4c6f08 100644 --- a/drivers/mfd/intel-lpss.h +++ b/drivers/mfd/intel-lpss.h @@ -19,6 +19,11 @@ * Set to ignore resource conflicts with ACPI declared SystemMemory regions. */ #define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0) +/* + * Some devices have misconfigured clock divider due to a firmware bug. + * Set this to force the clock divider to 1:1 ratio. + */ +#define QUIRK_CLOCK_DIVIDER_UNITY BIT(1) struct device; struct resource; From 5394040d0b67177344662bc2b928e9d67e8f431d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 29 Dec 2023 16:50:59 +0200 Subject: [PATCH 179/707] mfd: lpc_ich: Use ALIGN_DOWN() to obtain the start of the SPI base range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of open coding, use ALIGN_DOWN() for alignment. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231229145059.6138-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Lee Jones --- drivers/mfd/lpc_ich.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c index 73a0e7f9bd3116..f14901660147f5 100644 --- a/drivers/mfd/lpc_ich.c +++ b/drivers/mfd/lpc_ich.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1321,7 +1322,7 @@ static int lpc_ich_init_spi(struct pci_dev *dev) case INTEL_SPI_BYT: pci_read_config_dword(dev, SPIBASE_BYT, &spi_base); if (spi_base & SPIBASE_BYT_EN) { - res->start = spi_base & ~(SPIBASE_BYT_SZ - 1); + res->start = ALIGN_DOWN(spi_base, SPIBASE_BYT_SZ); res->end = res->start + SPIBASE_BYT_SZ - 1; info->set_writeable = lpc_ich_byt_set_writeable; From 5a6a8580defaf01f5b59c95cdd702a3ae1c7f224 Mon Sep 17 00:00:00 2001 From: Fuyao Kashizuku Date: Wed, 27 Dec 2023 10:01:17 +0800 Subject: [PATCH 180/707] mfd: sun4i-gpadc: Correct specified GPADC interrupt numbers The identifiers are used as IRQ resource numbers, where 0 is treated specially. This fixes sun4i-gpadc-iio probe failed when request irq. The backstack: WARNING: CPU: 3 PID: 1 at drivers/base/platform.c:451 __platform_get_irq_byname+0xb8/0xc4 0 is an invalid IRQ number Modules linked in: CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6 #9 Hardware name: Allwinner sun8i Family unwind_backtrace show_stack dump_stack_lvl __warn warn_slowpath_fmt __platform_get_irq_byname platform_get_irq_byname sun4i_irq_init sun4i_gpadc_probe platform_probe really_probe __driver_probe_device driver_probe_device __driver_attach bus_for_each_dev bus_add_driver driver_register do_one_initcall do_initcall_level do_initcalls kernel_init_freeable kernel_init Log reports: sun4i-gpadc-iio sun6i-a31-gpadc-iio.0: error -EINVAL: IRQ FIFO_DATA_PENDING not found sun4i-gpadc-iio: probe of sun6i-a31-gpadc-iio.0 failed with error -22 Signed-off-by: Fuyao Kashizuku Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/ZYuFbUUus9apiCpq@debian.cyg Signed-off-by: Lee Jones --- include/linux/mfd/sun4i-gpadc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index ea0ccf33a459ef..021f820f9d52bd 100644 --- a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -81,8 +81,8 @@ #define SUN4I_GPADC_TEMP_DATA 0x20 #define SUN4I_GPADC_DATA 0x24 -#define SUN4I_GPADC_IRQ_FIFO_DATA 0 -#define SUN4I_GPADC_IRQ_TEMP_DATA 1 +#define SUN4I_GPADC_IRQ_FIFO_DATA 1 +#define SUN4I_GPADC_IRQ_TEMP_DATA 2 /* 10s delay before suspending the IP */ #define SUN4I_GPADC_AUTOSUSPEND_DELAY 10000 From 91c63e4f2f88696097e93ebf59fe4c7e07d1d4ab Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 11 Jan 2024 16:21:13 +0000 Subject: [PATCH 181/707] mfd: omap-usb-host: Increase size of buffer to include all possible values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid these nasty W=1 errors: drivers/mfd/omap-usb-host.c: In function ‘usbhs_omap_probe’: drivers/mfd/omap-usb-host.c:706:54: error: ‘_clk’ directive output may be truncated writing 4 bytes into a region of size between 1 and 11 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:705:17: note: ‘snprintf’ output between 24 and 34 bytes into a destination of size 30 drivers/mfd/omap-usb-host.c:721:56: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:721:33: note: directive argument in the range [-2147483640, 2147483647] drivers/mfd/omap-usb-host.c:720:17: note: ‘snprintf’ output between 28 and 38 bytes into a destination of size 30 drivers/mfd/omap-usb-host.c:731:55: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 9 [-Werror=format-truncation=] drivers/mfd/omap-usb-host.c:731:33: note: directive argument in the range [-2147483640, 2147483647] drivers/mfd/omap-usb-host.c:730:17: note: ‘snprintf’ output between 27 and 37 bytes into a destination of size 30 Cc: Tony Lindgren Cc: Keshava Munegowda Cc: Roger Quadros Cc: linux-omap@vger.kernel.org Signed-off-by: Lee Jones --- drivers/mfd/omap-usb-host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index ebc62033db169d..949feb03d4f8d8 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -699,7 +699,7 @@ static int usbhs_omap_probe(struct platform_device *pdev) } for (i = 0; i < omap->nports; i++) { - char clkname[30]; + char clkname[40]; /* clock names are indexed from 1*/ snprintf(clkname, sizeof(clkname), From 47f28ec99bef945f173251d26496e77f767ecf17 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 15 Jan 2024 19:20:42 +0100 Subject: [PATCH 182/707] dt-bindings: mfd: iqs62x: Do not override firmware-name $ref dtschema package defines firmware-name as string-array, so individual bindings should not make it a string but instead just narrow the number of expected firmware file names. Signed-off-by: Krzysztof Kozlowski Acked-by: Conor Dooley Acked-by: Jeff LaBundy Link: https://lore.kernel.org/r/20240115182042.1610134-1-krzysztof.kozlowski@linaro.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/iqs62x.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mfd/iqs62x.yaml b/Documentation/devicetree/bindings/mfd/iqs62x.yaml index 044cd7542c2bcf..f438c237496639 100644 --- a/Documentation/devicetree/bindings/mfd/iqs62x.yaml +++ b/Documentation/devicetree/bindings/mfd/iqs62x.yaml @@ -31,7 +31,7 @@ properties: maxItems: 1 firmware-name: - $ref: /schemas/types.yaml#/definitions/string + maxItems: 1 description: Specifies the name of the calibration and configuration file selected by the driver. If this property is omitted, the name is chosen based on the From 3b75d271e161e22aff8171940a77510d2fb2ad6f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 14 Jan 2024 16:39:21 +0200 Subject: [PATCH 183/707] backlight: hx8357: Fix potential NULL pointer dereference The "im" pins are optional. Add missing check in the hx8357_probe(). Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/642e1230-3358-4006-a17f-3f297897ae74@moroto.mountain Fixes: 7d84a63a39b7 ("backlight: hx8357: Convert to agnostic GPIO API") Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240114143921.550736-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/video/backlight/hx8357.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index d7298376cf74dd..bf18337ff0c2c0 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -609,11 +609,13 @@ static int hx8357_probe(struct spi_device *spi) lcd->im_pins = devm_gpiod_get_array_optional(dev, "im", GPIOD_OUT_LOW); if (IS_ERR(lcd->im_pins)) return dev_err_probe(dev, PTR_ERR(lcd->im_pins), "failed to request im GPIOs\n"); - if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS) - return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n"); + if (lcd->im_pins) { + if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS) + return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n"); - for (i = 0; i < HX8357_NUM_IM_PINS; i++) - gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); + for (i = 0; i < HX8357_NUM_IM_PINS; i++) + gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); + } lcdev = devm_lcd_device_register(&spi->dev, "mxsfb", &spi->dev, lcd, &hx8357_ops); From cd84e6bd331fd556116ec4889dc282b07c392e42 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 16 Jan 2024 03:08:27 +0200 Subject: [PATCH 184/707] dt-bindings: mfd: qcom,tcsr: Add compatibles for QCM2290 and SM6115 Add qcom,qcm2290-tcsr and qcom,sm6115-tcsr, compatibles for TCSR blocks on the corresponding platforms. Signed-off-by: Dmitry Baryshkov Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240116-usbc-phy-vls-clamp-v1-1-73b2da7691c5@linaro.org Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml index 798705ab6a4601..b97d77015335f1 100644 --- a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml +++ b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml @@ -19,6 +19,7 @@ properties: - enum: - qcom,msm8976-tcsr - qcom,msm8998-tcsr + - qcom,qcm2290-tcsr - qcom,qcs404-tcsr - qcom,sc7180-tcsr - qcom,sc7280-tcsr @@ -28,6 +29,7 @@ properties: - qcom,sdx55-tcsr - qcom,sdx65-tcsr - qcom,sm4450-tcsr + - qcom,sm6115-tcsr - qcom,sm8150-tcsr - qcom,sm8250-tcsr - qcom,sm8350-tcsr From a1958f84deb5cdba020af725fc5003a05af4819c Mon Sep 17 00:00:00 2001 From: Lukasz Majczak Date: Fri, 19 Jan 2024 08:43:27 +0000 Subject: [PATCH 185/707] mfd: cros_ec: Register EC-based watchdog subdevice Add ChromeOS EC-based watchdog as EC subdevice. Signed-off-by: Lukasz Majczak Link: https://lore.kernel.org/r/20240119084328.3135503-4-lma@chromium.org Signed-off-by: Lee Jones --- drivers/mfd/cros_ec_dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c index 603b1cd5278507..4996220ce64b7b 100644 --- a/drivers/mfd/cros_ec_dev.c +++ b/drivers/mfd/cros_ec_dev.c @@ -91,6 +91,10 @@ static const struct mfd_cell cros_usbpd_notify_cells[] = { { .name = "cros-usbpd-notify", }, }; +static const struct mfd_cell cros_ec_wdt_cells[] = { + { .name = "cros-ec-wdt", } +}; + static const struct cros_feature_to_cells cros_subdevices[] = { { .id = EC_FEATURE_CEC, @@ -107,6 +111,11 @@ static const struct cros_feature_to_cells cros_subdevices[] = { .mfd_cells = cros_usbpd_charger_cells, .num_cells = ARRAY_SIZE(cros_usbpd_charger_cells), }, + { + .id = EC_FEATURE_HANG_DETECT, + .mfd_cells = cros_ec_wdt_cells, + .num_cells = ARRAY_SIZE(cros_ec_wdt_cells), + }, }; static const struct mfd_cell cros_ec_platform_cells[] = { From 67421634ade0979dafd3e3f21c9b63bc93ef4760 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 23 Jan 2024 09:59:48 +0000 Subject: [PATCH 186/707] mfd: rave-sp: Avoid unnecessary use of comma operator Although it does not seem to have any untoward side-effects, the use of ';' to separate to assignments seems more appropriate than ','. Flagged by clang-17 -Wcomma No functional change intended. Compile tested only. Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240123-rave-sp-comma-v1-1-84e9b15ba205@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/rave-sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/rave-sp.c b/drivers/mfd/rave-sp.c index 6ff84b2600c543..ea5fbcbbe4a56f 100644 --- a/drivers/mfd/rave-sp.c +++ b/drivers/mfd/rave-sp.c @@ -358,7 +358,7 @@ int rave_sp_exec(struct rave_sp *sp, ackid = atomic_inc_return(&sp->ackid); reply.ackid = ackid; - reply.code = rave_sp_reply_code((u8)command), + reply.code = rave_sp_reply_code((u8)command); mutex_lock(&sp->bus_lock); From 1e0ea9e75ff3f395ad6f85f0be2258ef114a53dc Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Tue, 23 Jan 2024 15:42:59 +0000 Subject: [PATCH 187/707] mfd: wm831x: Remove redundant forever while loop Current code executes only once despite the while loop, so remove the loop. Also msleep(1) will likely result in a larger sleep, so increase its value for clarity while keeping the same behaviour. Signed-off-by: Maciej Strozek Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20240123154259.81258-1-mstrozek@opensource.cirrus.com Signed-off-by: Lee Jones --- drivers/mfd/wm831x-auxadc.c | 43 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/drivers/mfd/wm831x-auxadc.c b/drivers/mfd/wm831x-auxadc.c index 65b98f3fbd9291..18618a8f92062e 100644 --- a/drivers/mfd/wm831x-auxadc.c +++ b/drivers/mfd/wm831x-auxadc.c @@ -152,7 +152,7 @@ static irqreturn_t wm831x_auxadc_irq(int irq, void *irq_data) static int wm831x_auxadc_read_polled(struct wm831x *wm831x, enum wm831x_auxadc input) { - int ret, src, timeout; + int ret, src; mutex_lock(&wm831x->auxadc_lock); @@ -179,32 +179,25 @@ static int wm831x_auxadc_read_polled(struct wm831x *wm831x, goto disable; } - /* If we're not using interrupts then poll the - * interrupt status register */ - timeout = 5; - while (timeout) { - msleep(1); + /* If we're not using interrupts then read the interrupt status register */ + msleep(20); - ret = wm831x_reg_read(wm831x, - WM831X_INTERRUPT_STATUS_1); - if (ret < 0) { - dev_err(wm831x->dev, - "ISR 1 read failed: %d\n", ret); - goto disable; - } + ret = wm831x_reg_read(wm831x, WM831X_INTERRUPT_STATUS_1); + if (ret < 0) { + dev_err(wm831x->dev, + "ISR 1 read failed: %d\n", ret); + goto disable; + } - /* Did it complete? */ - if (ret & WM831X_AUXADC_DATA_EINT) { - wm831x_reg_write(wm831x, - WM831X_INTERRUPT_STATUS_1, - WM831X_AUXADC_DATA_EINT); - break; - } else { - dev_err(wm831x->dev, - "AUXADC conversion timeout\n"); - ret = -EBUSY; - goto disable; - } + /* Did it complete? */ + if (ret & WM831X_AUXADC_DATA_EINT) { + wm831x_reg_write(wm831x, WM831X_INTERRUPT_STATUS_1, + WM831X_AUXADC_DATA_EINT); + } else { + dev_err(wm831x->dev, + "AUXADC conversion timeout\n"); + ret = -EBUSY; + goto disable; } ret = wm831x_reg_read(wm831x, WM831X_AUXADC_DATA); From 6b7704ff03d397788e75b8db78479222f0e80d3f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Jan 2024 15:24:46 -0700 Subject: [PATCH 188/707] iov_iter: streamline iovec/bvec alignment iteration Rewrite the alignment checking iterators for iovec and bvec to be easier to read, and also significantly more compact in terms of generated code. This saves 270 bytes of text on x86-64 for me (with clang-18) and 224 bytes on arm64 (with gcc-13). In profiles, also saves a bit of time as well for the same workload: 0.81% -0.18% [kernel.vmlinux] [k] iov_iter_aligned_bvec 0.48% -0.09% [kernel.vmlinux] [k] iov_iter_is_aligned which is a nice side benefit as well. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/544b31f7-6d4b-42f5-a544-1420501f081f@kernel.dk Reviewed-by: Keith Busch Signed-off-by: Christian Brauner v2: do the other half of the iterators too, as suggested by Keith. This further saves some text. --- lib/iov_iter.c | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e0aa6b440ca5f4..15f5040709c36e 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -714,12 +714,11 @@ EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { + const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len > size) @@ -729,34 +728,36 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; + iov++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { - size_t size = i->count; + const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; - unsigned k; + size_t size = i->count; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; + do { + size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } @@ -800,13 +801,12 @@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { + const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; @@ -814,30 +814,31 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) len = size; res |= len; size -= len; - if (!size) - break; } - } + iov++; + skip = 0; + } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { + const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; - res |= (unsigned long)i->bvec[k].bv_offset + skip; + do { + size_t len = bvec->bv_len - skip; + res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return res; } From 38c5f831b7aed53416db6c3b0297ea4cfac41294 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: [PATCH 189/707] fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. ========================================================= update pagecache folio_mark_uptodate(folio) smp_wmb() set_bit PG_uptodate === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size) folio_test_uptodate(folio) test_bit PG_uptodate smp_rmb() === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size) copy_page_to_iter() ========================================================= Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e8d..ebce4763b4bb9a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } From a17ab4403eaf06f54de8bd2f2217b4b69089ba93 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:56 +0800 Subject: [PATCH 190/707] Revert "mm/filemap: avoid buffered read/write race to read inconsistent data" This reverts commit e2c27b803bb6 ("mm/filemap: avoid buffered read/write race to read inconsistent data"). After making the i_size_read/write helpers be smp_load_acquire/store_release(), it is already guaranteed that changes to page contents are visible before we see increased inode size, so the extra smp_rmb() in filemap_read() can be removed. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-3-libaokun1@huawei.com Signed-off-by: Christian Brauner --- mm/filemap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db74..a72dd2eafd5ace 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2608,15 +2608,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size. - */ - smp_rmb(); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: From cf6e3cf145eb352e28812a741fde5065f1652ee8 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:57 +0800 Subject: [PATCH 191/707] asm-generic: remove extra type checking in acquire/release for non-SMP case If CONFIG_SMP is not enabled, the smp_load_acquire/smp_store_release is implemented as READ_ONCE/READ_ONCE and barrier() and type checking. READ_ONCE/READ_ONCE already checks the pointer type, and then checks it more stringently outside, but the non-SMP case simply isn't relevant, so remove the extra compiletime_assert_atomic_type() to avoid compilation errors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401230837.TXro0PHi-lkp@intel.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-4-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/asm-generic/barrier.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 961f4d88f9ef78..0c0695763bea39 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -193,7 +193,6 @@ do { \ #ifndef smp_store_release #define smp_store_release(p, v) \ do { \ - compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) @@ -203,7 +202,6 @@ do { \ #define smp_load_acquire(p) \ ({ \ __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ barrier(); \ (typeof(*p))___p1; \ }) From 458a1af5373e269a1f253440c35dac48ce107727 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Jan 2024 17:17:34 +0100 Subject: [PATCH 192/707] pidfd: cleanup the usage of __pidfd_prepare's flags - make pidfd_create() static. - Don't pass O_RDWR | O_CLOEXEC to __pidfd_prepare() in copy_process(), __pidfd_prepare() adds these flags unconditionally. - Kill the flags check in __pidfd_prepare(). sys_pidfd_open() checks the flags itself, all other users of pidfd_prepare() pass flags = 0. If we need a sanity check for those other in kernel users then WARN_ON_ONCE(flags & ~PIDFD_NONBLOCK) makes more sense. - Don't pass O_RDWR to get_unused_fd_flags(), it ignores everything except O_CLOEXEC. - Don't pass O_CLOEXEC to anon_inode_getfile(), it ignores everything except O_ACCMODE | O_NONBLOCK. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240125161734.GA778@redhat.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 1 - kernel/fork.c | 9 +++------ kernel/pid.c | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 395cacce1179ca..e6a041cb8bacc7 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -73,7 +73,6 @@ struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); -int pidfd_create(struct pid *pid, unsigned int flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index 47ff3b35352e0b..34704d277b68a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,15 +2130,12 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re int pidfd; struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + pidfd = get_unused_fd_flags(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR | O_CLOEXEC); + flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); @@ -2524,7 +2521,7 @@ __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_PIDFD) { /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); + retval = __pidfd_prepare(pid, 0, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; diff --git a/kernel/pid.c b/kernel/pid.c index b52b1086545415..c7a3e359f8f590 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -595,7 +595,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -int pidfd_create(struct pid *pid, unsigned int flags) +static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; From 73b9e13998fa51839f051873ab03967fdf3fe795 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 14:55:37 -0800 Subject: [PATCH 193/707] doc: Spinlocks are implied RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because the -rt locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade rcu_dereference.rst to document this non-obvious case. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu_dereference.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst index 659d5913784d0d..2524dcdadde2b8 100644 --- a/Documentation/RCU/rcu_dereference.rst +++ b/Documentation/RCU/rcu_dereference.rst @@ -408,7 +408,10 @@ member of the rcu_dereference() to use in various situations: RCU flavors, an RCU read-side critical section is entered using rcu_read_lock(), anything that disables bottom halves, anything that disables interrupts, or anything that disables - preemption. + preemption. Please note that spinlock critical sections + are also implied RCU read-side critical sections, even when + they are preemptible, as they are in kernels built with + CONFIG_PREEMPT_RT=y. 2. If the access might be within an RCU read-side critical section on the one hand, or protected by (say) my_lock on the other, From cdc2686695c2495f5802e7baae07479576180392 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 15:06:46 -0800 Subject: [PATCH 194/707] doc: Make whatisRCU.rst note that spinlocks are RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade whatisRCU.rst to document this non-obvious case. Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.rst | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 60ce02475142d8..246ce0d0b4d116 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -172,14 +172,25 @@ rcu_read_lock() critical section. Reference counts may be used in conjunction with RCU to maintain longer-term references to data structures. + Note that anything that disables bottom halves, preemption, + or interrupts also enters an RCU read-side critical section. + Acquiring a spinlock also enters an RCU read-side critical + sections, even for spinlocks that do not disable preemption, + as is the case in kernels built with CONFIG_PREEMPT_RT=y. + Sleeplocks do *not* enter RCU read-side critical sections. + rcu_read_unlock() ^^^^^^^^^^^^^^^^^ void rcu_read_unlock(void); This temporal primitives is used by a reader to inform the reclaimer that the reader is exiting an RCU read-side critical - section. Note that RCU read-side critical sections may be nested - and/or overlapping. + section. Anything that enables bottom halves, preemption, + or interrupts also exits an RCU read-side critical section. + Releasing a spinlock also exits an RCU read-side critical section. + + Note that RCU read-side critical sections may be nested and/or + overlapping. synchronize_rcu() ^^^^^^^^^^^^^^^^^ From 2888f00828d7779a4045b501fc5eb577b42248b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 16:23:56 -0800 Subject: [PATCH 195/707] doc: Make checklist.rst note that spinlocks are implied RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade checklist.rst to document this non-obvious case. While in the area, fix a typo by changing "read-side critical" to "read-side critical section". Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 2d42998a89a637..98a622f7724816 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -68,7 +68,8 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock_sched(), or by the appropriate update-side lock. Explicit disabling of preemption (preempt_disable(), for example) can serve as rcu_read_lock_sched(), but is less readable and - prevents lockdep from detecting locking issues. + prevents lockdep from detecting locking issues. Acquiring a + spinlock also enters an RCU read-side critical section. Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, @@ -444,7 +445,7 @@ over a rather long period of time, but improvements are always welcome! real-time workloads than is synchronize_rcu_expedited(). It is also permissible to sleep in RCU Tasks Trace read-side - critical, which are delimited by rcu_read_lock_trace() and + critical section, which are delimited by rcu_read_lock_trace() and rcu_read_unlock_trace(). However, this is a specialized flavor of RCU, and you should not use it without first checking with its current users. In most cases, you should instead use SRCU. From effe2f73f97f332b593e4fb6d466cc9485ff998c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 16:29:01 -0800 Subject: [PATCH 196/707] doc: Add CONFIG_RCU_STRICT_GRACE_PERIOD to checklist.rst This commit adds CONFIG_RCU_STRICT_GRACE_PERIOD to the list of debugging Kconfig options in checklist.rst. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 98a622f7724816..addd5c1547a420 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -491,6 +491,12 @@ over a rather long period of time, but improvements are always welcome! since the last time that you passed that same object to call_rcu() (or friends). + CONFIG_RCU_STRICT_GRACE_PERIOD: + combine with KASAN to check for pointers leaked out + of RCU read-side critical sections. This Kconfig + option is tough on both performance and scalability, + and so is limited to four-CPU systems. + __rcu sparse checks: tag the pointer to the RCU-protected data structure with __rcu, and sparse will warn you if you access that From 10514fa5f7a55149cf7fc63138f39d35763374bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Nov 2023 11:06:10 -0800 Subject: [PATCH 197/707] doc: Add EARLY flag to early-parsed kernel boot parameters Kernel boot parameters declared with early_param() are parsed before embedded parameters are extracted from initrd, and early_param() parameters are not helpful when embedded in initrd. Therefore, mark early_param() kernel boot parameters with "EARLY" in kernel-parameters.txt. The following early_param() calls declare kernel boot parameters that are undocumented: early_param("atmel.pm_modes", at91_pm_modes_select); early_param("mem_fclk_21285", early_fclk); early_param("ecc", early_ecc); early_param("cachepolicy", early_cachepolicy); early_param("nodebugmon", early_debug_disable); early_param("kfence.sample_interval", parse_kfence_early_init); early_param("additional_cpus", setup_additional_cpus); early_param("stram_pool", atari_stram_setup); early_param("disable_octeon_edac", disable_octeon_edac); early_param("rd_start", rd_start_early); early_param("rd_size", rd_size_early); early_param("coherentio", setcoherentio); early_param("nocoherentio", setnocoherentio); early_param("fadump", early_fadump_param); early_param("fadump_reserve_mem", early_fadump_reserve_mem); early_param("no_stf_barrier", handle_no_stf_barrier); early_param("no_rfi_flush", handle_no_rfi_flush); early_param("smt-enabled", early_smt_enabled); early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); early_param("ps3fb", early_parse_ps3fb); early_param("ps3flash", early_parse_ps3flash); early_param("novx", disable_vector_extension); early_param("nobp", nobp_setup_early); early_param("nospec", nospec_setup_early); early_param("possible_cpus", _setup_possible_cpus); early_param("stp", early_parse_stp); early_param("nopfault", nopfault); early_param("nmi_mode", nmi_mode_setup); early_param("sh_mv", early_parse_mv); early_param("pmb", early_pmb); early_param("hvirq", early_hvirq_major); early_param("cfi", cfi_parse_cmdline); early_param("disableapic", setup_disableapic); early_param("noapictimer", parse_disable_apic_timer); early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); early_param("uv_memblksize", parse_mem_block_size); early_param("retbleed", retbleed_parse_cmdline); early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); early_param("update_mptable", update_mptable_setup); early_param("alloc_mptable", parse_alloc_mptable_opt); early_param("possible_cpus", _setup_possible_cpus); early_param("lsmsi", early_parse_ls_scfg_msi); early_param("nokgdbroundup", opt_nokgdbroundup); early_param("kgdbcon", opt_kgdb_con); early_param("kasan", early_kasan_flag); early_param("kasan.mode", early_kasan_mode); early_param("kasan.vmalloc", early_kasan_flag_vmalloc); early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); early_param("kasan.fault", early_kasan_fault); early_param("kasan.stacktrace", early_kasan_flag_stacktrace); early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); early_param("accept_memory", accept_memory_parse); early_param("page_table_check", early_page_table_check_param); sh_early_platform_init("earlytimer", &sh_cmt_device_driver); early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); These are not necessarily bugs, given that some kernel boot parameters are intended for deep debugging rather than general use. This work does not cover all of the kernel boot parameters declared using cmdline_find_option() and cmdline_find_option_bool(). If these are in fact guaranteed to be early (which appears to be the case), they can be added in a later version of this patch. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Masami Hiramatsu Cc: Petr Malat Cc: Randy Dunlap Cc: Cc: --- .../admin-guide/kernel-parameters.rst | 1 + .../admin-guide/kernel-parameters.txt | 484 +++++++++--------- 2 files changed, 250 insertions(+), 235 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 102937bc8443a2..fe4644df4b4703 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -108,6 +108,7 @@ is applicable:: CMA Contiguous Memory Area support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime + EARLY Parameter processed too early to be embedded in initrd. EDD BIOS Enhanced Disk Drive Services (EDD) is enabled EFI EFI Partitioning (GPT) is enabled EVM Extended Verification Module diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d08c..4839f2919fdfa6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -9,7 +9,7 @@ accept_memory=eager can be used to accept all memory at once during boot. - acpi= [HW,ACPI,X86,ARM64,RISCV64] + acpi= [HW,ACPI,X86,ARM64,RISCV64,EARLY] Advanced Configuration and Power Interface Format: { force | on | off | strict | noirq | rsdt | copy_dsdt } @@ -26,7 +26,7 @@ See also Documentation/power/runtime_pm.rst, pci=noacpi - acpi_apic_instance= [ACPI, IOAPIC] + acpi_apic_instance= [ACPI,IOAPIC,EARLY] Format: 2: use 2nd APIC table, if available 1,0: use 1st APIC table @@ -41,7 +41,7 @@ If set to native, use the device's native backlight mode. If set to none, disable the ACPI backlight interface. - acpi_force_32bit_fadt_addr + acpi_force_32bit_fadt_addr [ACPI,EARLY] force FADT to use 32 bit addresses rather than the 64 bit X_* addresses. Some firmware have broken 64 bit addresses for force ACPI ignore these and use @@ -97,7 +97,7 @@ no: ACPI OperationRegions are not marked as reserved, no further checks are performed. - acpi_force_table_verification [HW,ACPI] + acpi_force_table_verification [HW,ACPI,EARLY] Enable table checksum verification during early stage. By default, this is disabled due to x86 early mapping size limitation. @@ -137,7 +137,7 @@ acpi_no_memhotplug [ACPI] Disable memory hotplug. Useful for kdump kernels. - acpi_no_static_ssdt [HW,ACPI] + acpi_no_static_ssdt [HW,ACPI,EARLY] Disable installation of static SSDTs at early boot time By default, SSDTs contained in the RSDT/XSDT will be installed automatically and they will appear under @@ -151,7 +151,7 @@ Ignore the ACPI-based watchdog interface (WDAT) and let a native driver control the watchdog device instead. - acpi_rsdp= [ACPI,EFI,KEXEC] + acpi_rsdp= [ACPI,EFI,KEXEC,EARLY] Pass the RSDP address to the kernel, mostly used on machines running EFI runtime service to boot the second kernel for kdump. @@ -228,10 +228,10 @@ to assume that this machine's pmtimer latches its value and always returns good values. - acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode + acpi_sci= [HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } - acpi_skip_timer_override [HW,ACPI] + acpi_skip_timer_override [HW,ACPI,EARLY] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. @@ -266,11 +266,11 @@ behave incorrectly in some ways with respect to system suspend and resume to be ignored (use wisely). - acpi_use_timer_override [HW,ACPI] + acpi_use_timer_override [HW,ACPI,EARLY] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET - add_efi_memmap [EFI; X86] Include EFI memory map in + add_efi_memmap [EFI,X86,EARLY] Include EFI memory map in kernel's map of available physical RAM. agp= [AGP] @@ -307,7 +307,7 @@ do not want to use tracing_snapshot_alloc() as it needs to be done where GFP_KERNEL allocations are allowed. - allow_mismatched_32bit_el0 [ARM64] + allow_mismatched_32bit_el0 [ARM64,EARLY] Allow execve() of 32-bit applications and setting of the PER_LINUX32 personality on systems where only a strict subset of the CPUs support 32-bit EL0. When this @@ -351,7 +351,7 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) - amd_pstate= [X86] + amd_pstate= [X86,EARLY] disable Do not enable amd_pstate as the default scaling driver for the supported processors @@ -391,7 +391,7 @@ not play well with APC CPU idle - disable it if you have APC and your system crashes randomly. - apic= [APIC,X86] Advanced Programmable Interrupt Controller + apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller Change the output verbosity while booting Format: { quiet (default) | verbose | debug } Change the amount of debugging information output @@ -401,7 +401,7 @@ Format: apic=driver_name Examples: apic=bigsmp - apic_extnmi= [APIC,X86] External NMI delivery setting + apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } bsp: External NMI is delivered only to CPU 0 all: External NMIs are broadcast to all CPUs as a @@ -508,21 +508,22 @@ bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. - bgrt_disable [ACPI][X86] + bgrt_disable [ACPI,X86,EARLY] Disable BGRT to avoid flickering OEM logo. blkdevparts= Manual partition parsing of block device(s) for embedded devices based on command line input. See Documentation/block/cmdline-partition.rst - boot_delay= Milliseconds to delay each printk during boot. + boot_delay= [KNL,EARLY] + Milliseconds to delay each printk during boot. Only works if CONFIG_BOOT_PRINTK_DELAY is enabled, and you may also have to specify "lpj=". Boot_delay values larger than 10 seconds (10000) are assumed erroneous and ignored. Format: integer - bootconfig [KNL] + bootconfig [KNL,EARLY] Extended command line options can be added to an initrd and this will cause the kernel to look for it. @@ -557,7 +558,7 @@ trust validation. format: { id: | builtin } - cca= [MIPS] Override the kernel pages' cache coherency + cca= [MIPS,EARLY] Override the kernel pages' cache coherency algorithm. Accepted values range from 0 to 7 inclusive. See arch/mips/include/asm/pgtable-bits.h for platform specific values (SB1, Loongson3 and @@ -672,7 +673,7 @@ [X86-64] hpet,tsc clocksource.arm_arch_timer.evtstrm= - [ARM,ARM64] + [ARM,ARM64,EARLY] Format: Enable/disable the eventstream feature of the ARM architected timer so that code using WFE-based polling @@ -702,7 +703,7 @@ 10 seconds when built into the kernel. cma=nn[MG]@[start[MG][-end[MG]]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel global memory area for contiguous memory allocations and optionally the placement constraint by the physical address range of @@ -711,7 +712,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not @@ -722,7 +723,7 @@ they will fallback to the global default memory area. numa_cma=:nn[MG][,:nn[MG]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel numa memory area for contiguous memory allocations. It will reserve CMA area for the specified node. @@ -739,7 +740,7 @@ a hypervisor. Default: yes - coherent_pool=nn[KMG] [ARM,KNL] + coherent_pool=nn[KMG] [ARM,KNL,EARLY] Sets the size of memory pool for coherent, atomic dma allocations, by default set to 256K. @@ -757,7 +758,7 @@ condev= [HW,S390] console device conmode= - con3215_drop= [S390] 3215 console drop mode. + con3215_drop= [S390,EARLY] 3215 console drop mode. Format: y|n|Y|N|1|0 When set to true, drop data on the 3215 console when the console buffer is full. In this case the @@ -863,7 +864,7 @@ kernel before the cpufreq driver probes. cpu_init_udelay=N - [X86] Delay for N microsec between assert and de-assert + [X86,EARLY] Delay for N microsec between assert and de-assert of APIC INIT to start processors. This delay occurs on every CPU online, such as boot, and resume from suspend. Default: 10000 @@ -883,7 +884,7 @@ kernel more unstable. crashkernel=size[KMG][@offset[KMG]] - [KNL] Using kexec, Linux can switch to a 'crash kernel' + [KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel' upon panic. This parameter reserves the physical memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset @@ -954,10 +955,10 @@ Format: , See also Documentation/input/devices/joystick-parport.rst - debug [KNL] Enable kernel debugging (events log level). + debug [KNL,EARLY] Enable kernel debugging (events log level). debug_boot_weak_hash - [KNL] Enable printing [hashed] pointers early in the + [KNL,EARLY] Enable printing [hashed] pointers early in the boot sequence. If enabled, we use a weak hash instead of siphash to hash pointers. Use this option if you are seeing instances of '(___ptrval___)') and need to see a @@ -974,10 +975,10 @@ will print _a_lot_ more information - normally only useful to lockdep developers. - debug_objects [KNL] Enable object debugging + debug_objects [KNL,EARLY] Enable object debugging debug_guardpage_minorder= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter allows control of the order of pages that will be intentionally kept free (and hence protected) by the buddy allocator. Bigger value increase the probability @@ -996,7 +997,7 @@ help tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. @@ -1004,8 +1005,8 @@ useful to also enable the page_owner functionality. on: enable the feature - debugfs= [KNL] This parameter enables what is exposed to userspace - and debugfs internal clients. + debugfs= [KNL,EARLY] This parameter enables what is exposed to + userspace and debugfs internal clients. Format: { on, no-mount, off } on: All functions are enabled. no-mount: @@ -1084,7 +1085,7 @@ dhash_entries= [KNL] Set number of hash buckets for dentry cache. - disable_1tb_segments [PPC] + disable_1tb_segments [PPC,EARLY] Disables the use of 1TB hash page table segments. This causes the kernel to fall back to 256MB segments which can be useful when debugging issues that require an SLB @@ -1093,7 +1094,7 @@ disable= [IPV6] See Documentation/networking/ipv6.rst. - disable_radix [PPC] + disable_radix [PPC,EARLY] Disable RADIX MMU mode on POWER9 disable_tlbie [PPC] @@ -1109,25 +1110,25 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_ddw [PPC/PSERIES] + disable_ddw [PPC/PSERIES,EARLY] Disable Dynamic DMA Window support. Use this to workaround buggy firmware. disable_ipv6= [IPV6] See Documentation/networking/ipv6.rst. - disable_mtrr_cleanup [X86] + disable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter disables that. - disable_mtrr_trim [X86, Intel and AMD only] + disable_mtrr_trim [X86, Intel and AMD only,EARLY] By default the kernel will trim any uncacheable memory out of your available memory pool based on MTRR settings. This parameter disables that behavior, possibly causing your machine to run very slowly. - disable_timer_pin_1 [X86] + disable_timer_pin_1 [X86,EARLY] Disable PIN 1 of APIC timer Can be useful to work around chipset bugs. @@ -1177,7 +1178,7 @@ dscc4.setup= [NET] - dt_cpu_ftrs= [PPC] + dt_cpu_ftrs= [PPC,EARLY] Format: {"off" | "known"} Control how the dt_cpu_ftrs device-tree binding is used for CPU feature discovery and setup (if it @@ -1197,12 +1198,12 @@ Documentation/admin-guide/dynamic-debug-howto.rst for details. - early_ioremap_debug [KNL] + early_ioremap_debug [KNL,EARLY] Enable debug messages in early_ioremap support. This is useful for tracking down temporary early mappings which are not unmapped. - earlycon= [KNL] Output early console device and options. + earlycon= [KNL,EARLY] Output early console device and options. When used with no options, the early console is determined by stdout-path property in device tree's @@ -1338,7 +1339,7 @@ address must be provided, and the serial port must already be setup and configured. - earlyprintk= [X86,SH,ARM,M68k,S390] + earlyprintk= [X86,SH,ARM,M68k,S390,UM,EARLY] earlyprintk=vga earlyprintk=sclp earlyprintk=xen @@ -1396,7 +1397,7 @@ edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} - efi= [EFI] + efi= [EFI,EARLY] Format: { "debug", "disable_early_pci_dma", "nochunk", "noruntime", "nosoftreserve", "novamap", "no_disable_early_pci_dma" } @@ -1417,13 +1418,13 @@ no_disable_early_pci_dma: Leave the busmaster bit set on all PCI bridges while in the EFI boot stub - efi_no_storage_paranoia [EFI; X86] + efi_no_storage_paranoia [EFI,X86,EARLY] Using this parameter you can use more than 50% of your efi variable storage. Use this parameter only if you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick. - efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is @@ -1454,7 +1455,7 @@ eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. - ekgdboc= [X86,KGDB] Allow early kernel console debugging + ekgdboc= [X86,KGDB,EARLY] Allow early kernel console debugging Format: ekgdboc=kbd This is designed to be used in conjunction with @@ -1469,13 +1470,13 @@ See comment before function elanfreq_setup() in arch/x86/kernel/cpu/cpufreq/elanfreq.c. - elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390] + elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY] Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. See Documentation/admin-guide/kdump/kdump.rst for details. - enable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter enables that. @@ -1508,7 +1509,7 @@ Permit 'security.evm' to be updated regardless of current integrity status. - early_page_ext [KNL] Enforces page_ext initialization to earlier + early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier stages so cover more early boot allocations. Please note that as side effect some optimizations might be disabled to achieve that (e.g. parallelized @@ -1600,7 +1601,7 @@ can be changed at run time by the max_graph_depth file in the tracefs tracing directory. default: 0 (no limit) - fw_devlink= [KNL] Create device links between consumer and supplier + fw_devlink= [KNL,EARLY] Create device links between consumer and supplier devices by scanning the firmware to infer the consumer/supplier relationships. This feature is especially useful when drivers are loaded as modules as @@ -1619,12 +1620,12 @@ rpm -- Like "on", but also use to order runtime PM. fw_devlink.strict= - [KNL] Treat all inferred dependencies as mandatory + [KNL,EARLY] Treat all inferred dependencies as mandatory dependencies. This only applies for fw_devlink=on|rpm. Format: fw_devlink.sync_state = - [KNL] When all devices that could probe have finished + [KNL,EARLY] When all devices that could probe have finished probing, this parameter controls what to do with devices that haven't yet received their sync_state() calls. @@ -1645,12 +1646,12 @@ gamma= [HW,DRM] - gart_fix_e820= [X86-64] disable the fix e820 for K8 GART + gart_fix_e820= [X86-64,EARLY] disable the fix e820 for K8 GART Format: off | on default: on gather_data_sampling= - [X86,INTEL] Control the Gather Data Sampling (GDS) + [X86,INTEL,EARLY] Control the Gather Data Sampling (GDS) mitigation. Gather Data Sampling is a hardware vulnerability which @@ -1748,7 +1749,7 @@ (that will set all pages holding image data during restoration read-only). - highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise. This also works to reduce highmem size on bigger boxes. @@ -1759,7 +1760,7 @@ hlt [BUGS=ARM,SH] - hostname= [KNL] Set the hostname (aka UTS nodename). + hostname= [KNL,EARLY] Set the hostname (aka UTS nodename). Format: This allows setting the system's hostname during early startup. This sets the name returned by gethostname. @@ -1804,7 +1805,7 @@ Documentation/admin-guide/mm/hugetlbpage.rst. Format: size[KMG] - hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation + hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation of gigantic hugepages. Or using node format, the size of a CMA area per node can be specified. Format: nn[KMGTPE] or (node format) @@ -1850,9 +1851,10 @@ If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. - hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations - which allow the hypervisor to 'idle' the - guest on lock contention. + hv_nopvspin [X86,HYPER_V,EARLY] + Disables the paravirt spinlock optimizations + which allow the hypervisor to 'idle' the guest + on lock contention. i2c_bus= [HW] Override the default board specific I2C bus speed or register an additional I2C bus that is not @@ -1917,7 +1919,7 @@ Format: [,[,[,]]] - idle= [X86] + idle= [X86,EARLY] Format: idle=poll, idle=halt, idle=nomwait Poll forces a polling idle loop that can slightly improve the performance of waking up a idle CPU, but @@ -1973,7 +1975,7 @@ mode generally follows that for the NaN encoding, except where unsupported by hardware. - ignore_loglevel [KNL] + ignore_loglevel [KNL,EARLY] Ignore loglevel setting - this will print /all/ kernel messages to the console. Useful for debugging. We also add it as printk module parameter, so users @@ -2091,21 +2093,21 @@ unpacking being completed before device_ and late_ initcalls. - initrd= [BOOT] Specify the location of the initial ramdisk + initrd= [BOOT,EARLY] Specify the location of the initial ramdisk - initrdmem= [KNL] Specify a physical address and size from which to + initrdmem= [KNL,EARLY] Specify a physical address and size from which to load the initrd. If an initrd is compiled in or specified in the bootparams, it takes priority over this setting. Format: ss[KMG],nn[KMG] Default is 0, 0 - init_on_alloc= [MM] Fill newly allocated pages and heap objects with + init_on_alloc= [MM,EARLY] Fill newly allocated pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. - init_on_free= [MM] Fill freed pages and heap objects with zeroes. + init_on_free= [MM,EARLY] Fill freed pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. @@ -2161,7 +2163,7 @@ 0 disables intel_idle and fall back on acpi_idle. 1 to 9 specify maximum depth of C-state. - intel_pstate= [X86] + intel_pstate= [X86,EARLY] disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -2205,7 +2207,7 @@ Allow per-logical-CPU P-State performance control limits using cpufreq sysfs interface - intremap= [X86-64, Intel-IOMMU] + intremap= [X86-64,Intel-IOMMU,EARLY] on enable Interrupt Remapping (default) off disable Interrupt Remapping nosid disable Source ID checking @@ -2217,7 +2219,7 @@ strict regions from userspace. relaxed - iommu= [X86] + iommu= [X86,EARLY] off force noforce @@ -2232,7 +2234,7 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. - iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices. Format: { "0" | "1" } 0 - Try to allocate a 32-bit DMA address first, before falling back to the full range if needed. @@ -2240,7 +2242,7 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. - iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour + iommu.strict= [ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. Request that DMA unmap operations use deferred @@ -2256,7 +2258,7 @@ legacy driver-specific options takes precedence. iommu.passthrough= - [ARM64, X86] Configure DMA to bypass the IOMMU by default. + [ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } 0 - Use IOMMU translation for DMA. 1 - Bypass the IOMMU for DMA. @@ -2266,7 +2268,7 @@ See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. - io_delay= [X86] I/O delay method + io_delay= [X86,EARLY] I/O delay method 0x80 Standard port 0x80 based delay 0xed @@ -2279,28 +2281,28 @@ ip= [IP_PNP] See Documentation/admin-guide/nfs/nfsroot.rst. - ipcmni_extend [KNL] Extend the maximum number of unique System V + ipcmni_extend [KNL,EARLY] Extend the maximum number of unique System V IPC identifiers from 32,768 to 16,777,216. irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. irqchip.gicv2_force_probe= - [ARM, ARM64] + [ARM,ARM64,EARLY] Format: Force the kernel to look for the second 4kB page of a GICv2 controller even if the memory range exposed by the device tree is too small. irqchip.gicv3_nolpi= - [ARM, ARM64] + [ARM,ARM64,EARLY] Force the kernel to ignore the availability of LPIs (and by consequence ITSs). Intended for system that use the kernel as a bootloader, and thus want to let secondary kernels in charge of setting up LPIs. - irqchip.gicv3_pseudo_nmi= [ARM64] + irqchip.gicv3_pseudo_nmi= [ARM64,EARLY] Enables support for pseudo-NMIs in the kernel. This requires the kernel to be built with CONFIG_ARM64_PSEUDO_NMI. @@ -2445,7 +2447,7 @@ parameter KASAN will print report only for the first invalid access. - keep_bootcon [KNL] + keep_bootcon [KNL,EARLY] Do not unregister boot console at start. This is only useful for debugging when something happens in the window between unregistering the boot console and initializing @@ -2453,7 +2455,7 @@ keepinitrd [HW,ARM] See retain_initrd. - kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested @@ -2478,7 +2480,7 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms. - kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. + kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port. Format: [,poll interval] The controller # is the number of the ehci usb debug port as it is probed via PCI. The poll interval is @@ -2499,7 +2501,7 @@ kms, kbd format: kms,kbd kms, kbd and serial format: kms,kbd,[,baud] - kgdboc_earlycon= [KGDB,HW] + kgdboc_earlycon= [KGDB,HW,EARLY] If the boot console provides the ability to read characters and can work in polling mode, you can use this parameter to tell kgdb to use it as a backend @@ -2514,14 +2516,14 @@ blank and the first boot console that implements read() will be picked. - kgdbwait [KGDB] Stop kernel execution and enter the + kgdbwait [KGDB,EARLY] Stop kernel execution and enter the kernel debugger at the earliest opportunity. kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. - kmemleak= [KNL] Boot-time kmemleak enable/disable + kmemleak= [KNL,EARLY] Boot-time kmemleak enable/disable Valid arguments: on, off Default: on Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, @@ -2540,8 +2542,8 @@ See also Documentation/trace/kprobetrace.rst "Kernel Boot Parameter" section. - kpti= [ARM64] Control page table isolation of user - and kernel address spaces. + kpti= [ARM64,EARLY] Control page table isolation of + user and kernel address spaces. Default: enabled on cores which need mitigation. 0: force disabled 1: force enabled @@ -2618,7 +2620,8 @@ for NPT. kvm-arm.mode= - [KVM,ARM] Select one of KVM/arm64's modes of operation. + [KVM,ARM,EARLY] Select one of KVM/arm64's modes of + operation. none: Forcefully disable KVM. @@ -2638,22 +2641,22 @@ used with extreme caution. kvm-arm.vgic_v3_group0_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-0 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 system registers kvm-arm.vgic_v3_group1_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-1 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1 system registers kvm-arm.vgic_v3_common_trap= - [KVM,ARM] Trap guest accesses to GICv3 common + [KVM,ARM,EARLY] Trap guest accesses to GICv3 common system registers kvm-arm.vgic_v4_enable= - [KVM,ARM] Allow use of GICv4 for direct injection of - LPIs. + [KVM,ARM,EARLY] Allow use of GICv4 for direct + injection of LPIs. - kvm_cma_resv_ratio=n [PPC] + kvm_cma_resv_ratio=n [PPC,EARLY] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable allocation. @@ -2706,7 +2709,7 @@ (enabled). Disable by KVM if hardware lacks support for it. - l1d_flush= [X86,INTEL] + l1d_flush= [X86,INTEL,EARLY] Control mitigation for L1D based snooping vulnerability. Certain CPUs are vulnerable to an exploit against CPU @@ -2723,7 +2726,7 @@ on - enable the interface for the mitigation - l1tf= [X86] Control mitigation of the L1TF vulnerability on + l1tf= [X86,EARLY] Control mitigation of the L1TF vulnerability on affected CPUs The kernel PTE inversion protection is unconditionally @@ -2792,7 +2795,7 @@ l3cr= [PPC] - lapic [X86-32,APIC] Enable the local APIC even if BIOS + lapic [X86-32,APIC,EARLY] Enable the local APIC even if BIOS disabled it. lapic= [X86,APIC] Do not use TSC deadline @@ -2800,7 +2803,7 @@ back to the programmable timer unit in the LAPIC. Format: notscdeadline - lapic_timer_c2_ok [X86,APIC] trust the local apic timer + lapic_timer_c2_ok [X86,APIC,EARLY] trust the local apic timer in C2 power state. libata.dma= [LIBATA] DMA control @@ -2924,7 +2927,7 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. Format: - lockdown= [SECURITY] + lockdown= [SECURITY,EARLY] { integrity | confidentiality } Enable the kernel lockdown feature. If set to integrity, kernel features that allow userland to @@ -3031,7 +3034,8 @@ logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver Format: - loglevel= All Kernel Messages with a loglevel smaller than the + loglevel= [KNL,EARLY] + All Kernel Messages with a loglevel smaller than the console loglevel will be printed to the console. It can also be changed with klogd or other programs. The loglevels are defined as follows: @@ -3045,13 +3049,15 @@ 6 (KERN_INFO) informational 7 (KERN_DEBUG) debug-level messages - log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two and greater - than the minimal size. The minimal size is defined - by LOG_BUF_SHIFT kernel config parameter. There is - also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter - that allows to increase the default size depending on - the number of CPUs. See init/Kconfig for more details. + log_buf_len=n[KMG] [KNL,EARLY] + Sets the size of the printk ring buffer, in bytes. + n must be a power of two and greater than the + minimal size. The minimal size is defined by + LOG_BUF_SHIFT kernel config parameter. There + is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config + parameter that allows to increase the default size + depending on the number of CPUs. See init/Kconfig + for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@ -3109,7 +3115,7 @@ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater than or equal to this physical address is ignored. - maxcpus= [SMP] Maximum number of processors that an SMP kernel + maxcpus= [SMP,EARLY] Maximum number of processors that an SMP kernel will bring up during bootup. maxcpus=n : n >= 0 limits the kernel to bring up 'n' processors. Surely after bootup you can bring up the other plugged cpu by executing @@ -3136,7 +3142,7 @@ Format: , Specifies range of consoles to be captured by the MDA. - mds= [X86,INTEL] + mds= [X86,INTEL,EARLY] Control mitigation for the Micro-architectural Data Sampling (MDS) vulnerability. @@ -3168,11 +3174,12 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst - mem=nn[KMG] [HEXAGON] Set the memory size. + mem=nn[KMG] [HEXAGON,EARLY] Set the memory size. Must be specified, otherwise memory size will be 0. - mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used in cases as follows: + mem=nn[KMG] [KNL,BOOT,EARLY] Force usage of a specific amount + of memory Amount of memory to be used in cases + as follows: 1 for test; 2 when the kernel is not able to see the whole system memory; @@ -3196,8 +3203,8 @@ if system memory of hypervisor is not sufficient. mem=nn[KMG]@ss[KMG] - [ARM,MIPS] - override the memory layout reported by - firmware. + [ARM,MIPS,EARLY] - override the memory layout + reported by firmware. Define a memory region of size nn[KMG] starting at ss[KMG]. Multiple different regions can be specified with @@ -3206,7 +3213,7 @@ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. - memblock=debug [KNL] Enable memblock debug messages. + memblock=debug [KNL,EARLY] Enable memblock debug messages. memchunk=nn[KMG] [KNL,SH] Allow user to override the default size for @@ -3220,14 +3227,14 @@ option. See Documentation/admin-guide/mm/memory-hotplug.rst. - memmap=exactmap [KNL,X86] Enable setting of an exact + memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on BIOS output or other requirements. See the memmap=nn@ss option description. memmap=nn[KMG]@ss[KMG] - [KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory. + [KNL, X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], which limits max address to nn[KMG]. @@ -3237,11 +3244,11 @@ memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] - [KNL,ACPI] Mark specific memory as ACPI data. + [KNL,ACPI,EARLY] Mark specific memory as ACPI data. Region of memory to be marked is from ss to ss+nn. memmap=nn[KMG]$ss[KMG] - [KNL,ACPI] Mark specific memory as reserved. + [KNL,ACPI,EARLY] Mark specific memory as reserved. Region of memory to be reserved is from ss to ss+nn. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 @@ -3251,14 +3258,14 @@ like Grub2, otherwise '$' and the following number will be eaten. - memmap=nn[KMG]!ss[KMG] + memmap=nn[KMG]!ss[KMG,EARLY] [KNL,X86] Mark specific memory as protected. Region of memory to be used, from ss to ss+nn. The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. memmap=%-+ - [KNL,ACPI] Convert memory within the specified region + [KNL,ACPI,EARLY] Convert memory within the specified region from to . If "-" is left out, the whole region will be marked as , even if previously unavailable. If "+" is left @@ -3266,7 +3273,7 @@ specified as e820 types, e.g., 1 = RAM, 2 = reserved, 3 = ACPI, 12 = PRAM. - memory_corruption_check=0/1 [X86] + memory_corruption_check=0/1 [X86,EARLY] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. Setting this option will scan the memory @@ -3278,13 +3285,13 @@ affects the same memory, you can use memmap= to prevent the kernel from using that memory. - memory_corruption_check_size=size [X86] + memory_corruption_check_size=size [X86,EARLY] By default it checks for corruption in the low 64k, making this memory unavailable for normal use. Use this parameter to scan for corruption in more or less memory. - memory_corruption_check_period=seconds [X86] + memory_corruption_check_period=seconds [X86,EARLY] By default it checks for corruption every 60 seconds. Use this parameter to check at some other rate. 0 disables periodic checking. @@ -3308,7 +3315,7 @@ Note that even when enabled, there are a few cases where the feature is not effective. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest + memtest= [KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest Format: default : 0 Specifies the number of memtest passes to be @@ -3376,7 +3383,7 @@ https://repo.or.cz/w/linux-2.6/mini2440.git mitigations= - [X86,PPC,S390,ARM64] Control optional mitigations for + [X86,PPC,S390,ARM64,EARLY] Control optional mitigations for CPU vulnerabilities. This is a set of curated, arch-independent options, each of which is an aggregation of existing arch-specific options. @@ -3429,7 +3436,7 @@ retbleed=auto,nosmt [X86] mminit_loglevel= - [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this + [KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this parameter allows control of the logging verbosity for the additional memory initialisation checks. A value of 0 disables mminit logging and a level of 4 will @@ -3437,7 +3444,7 @@ so loglevel=8 may also need to be specified. mmio_stale_data= - [X86,INTEL] Control mitigation for the Processor + [X86,INTEL,EARLY] Control mitigation for the Processor MMIO Stale Data vulnerabilities. Processor MMIO Stale Data is a class of @@ -3512,7 +3519,7 @@ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices reporting absolute coordinates, such as tablets - movablecore= [KNL,X86,IA-64,PPC] + movablecore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% This parameter is the complement to kernelcore=, it specifies the amount of memory used for migratable @@ -3523,7 +3530,7 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to make hotplugable memory + movable_node [KNL,EARLY] Boot-time switch to make hotplugable memory NUMA nodes to be movable. This means that the memory of such nodes will be usable only for movable allocations which rules out almost all kernel @@ -3547,21 +3554,21 @@ [HW] Make the MicroTouch USB driver use raw coordinates ('y', default) or cooked coordinates ('n') - mtrr=debug [X86] + mtrr=debug [X86,EARLY] Enable printing debug information related to MTRR registers at boot time. - mtrr_chunk_size=nn[KMG] [X86] + mtrr_chunk_size=nn[KMG,X86,EARLY] used for mtrr cleanup. It is largest continuous chunk that could hold holes aka. UC entries. - mtrr_gran_size=nn[KMG] [X86] + mtrr_gran_size=nn[KMG,X86,EARLY] Used for mtrr cleanup. It is granularity of mtrr block. Default is 1. Large value could prevent small alignment from using up MTRRs. - mtrr_spare_reg_nr=n [X86] + mtrr_spare_reg_nr=n [X86,EARLY] Format: Range: 0,7 : spare reg number Default : 1 @@ -3747,10 +3754,10 @@ emulation library even if a 387 maths coprocessor is present. - no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces - kernel to use 3-level paging instead. + no4lvl [RISCV,EARLY] Disable 4-level and 5-level paging modes. + Forces kernel to use 3-level paging instead. - no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces + no5lvl [X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. noaliencache [MM, NUMA, SLAB] Disables the allocation of alien @@ -3759,15 +3766,15 @@ noalign [KNL,ARM] - noaltinstr [S390] Disables alternative instructions patching - (CPU alternatives feature). + noaltinstr [S390,EARLY] Disables alternative instructions + patching (CPU alternatives feature). - noapic [SMP,APIC] Tells the kernel to not make use of any + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any IOAPICs that may be present in the system. noautogroup Disable scheduler automatic task group creation. - nocache [ARM] + nocache [ARM,EARLY] no_console_suspend [HW] Never suspend the console @@ -3785,13 +3792,13 @@ turn on/off it dynamically. no_debug_objects - [KNL] Disable object debugging + [KNL,EARLY] Disable object debugging nodsp [SH] Disable hardware DSP at boot time. - noefi Disable EFI runtime services support. + noefi [EFI,EARLY] Disable EFI runtime services support. - no_entry_flush [PPC] Don't flush the L1-D cache when entering the kernel. + no_entry_flush [PPC,EARLY] Don't flush the L1-D cache when entering the kernel. noexec [IA-64] @@ -3822,6 +3829,7 @@ real-time systems. no_hash_pointers + [KNL,EARLY] Force pointers printed to the console or buffers to be unhashed. By default, when a pointer is printed via %p format string, that pointer is "hashed", i.e. obscured @@ -3846,9 +3854,9 @@ the impact of the sleep instructions. This is also useful when using JTAG debugger. - nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings. - nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. + nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings. nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off @@ -3870,13 +3878,13 @@ noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. - nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + nointremap [X86-64,Intel-IOMMU,EARLY] Do not enable interrupt remapping. [Deprecated - use intremap=off] nointroute [IA-64] - noinvpcid [X86] Disable the INVPCID cpu feature. + noinvpcid [X86,EARLY] Disable the INVPCID cpu feature. noiotrap [SH] Disables trapped I/O port accesses. @@ -3887,19 +3895,19 @@ nojitter [IA-64] Disables jitter checking for ITC timers. - nokaslr [KNL] + nokaslr [KNL,EARLY] When CONFIG_RANDOMIZE_BASE is set, this disables kernel and module base offset ASLR (Address Space Layout Randomization). - no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page + no-kvmapf [X86,KVM,EARLY] Disable paravirtualized asynchronous page fault handling. - no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + no-kvmclock [X86,KVM,EARLY] Disable paravirtualized KVM clock driver - nolapic [X86-32,APIC] Do not enable or use the local APIC. + nolapic [X86-32,APIC,EARLY] Do not enable or use the local APIC. - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. + nolapic_timer [X86-32,APIC,EARLY] Do not use the local APIC timer. nomca [IA-64] Disable machine check abort handling @@ -3924,23 +3932,23 @@ shutdown the other cpus. Instead use the REBOOT_VECTOR irq. - nopat [X86] Disable PAT (page attribute table extension of + nopat [X86,EARLY] Disable PAT (page attribute table extension of pagetables) support. - nopcid [X86-64] Disable the PCID cpu feature. + nopcid [X86-64,EARLY] Disable the PCID cpu feature. nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. - nopti [X86-64] + nopti [X86-64,EARLY] Equivalent to pti=off - nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + nopv= [X86,XEN,KVM,HYPER_V,VMWARE,EARLY] Disables the PV optimizations forcing the guest to run as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. - nopvspin [X86,XEN,KVM] + nopvspin [X86,XEN,KVM,EARLY] Disables the qspinlock slow path using PV optimizations which allow the hypervisor to 'idle' the guest on lock contention. @@ -3960,20 +3968,20 @@ This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). - nosgx [X86-64,SGX] Disables Intel SGX kernel support. + nosgx [X86-64,SGX,EARLY] Disables Intel SGX kernel support. - nosmap [PPC] + nosmap [PPC,EARLY] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [PPC64s] + nosmep [PPC64s,EARLY] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. - nosmp [SMP] Tells an SMP kernel to act as a UP kernel, + nosmp [SMP,EARLY] Tells an SMP kernel to act as a UP kernel, and disable the IO APIC. legacy for "maxcpus=0". - nosmt [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT). + nosmt [KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT). Equivalent to smt=1. [KNL,X86,PPC] Disable symmetric multithreading (SMT). @@ -3983,22 +3991,23 @@ nosoftlockup [KNL] Disable the soft-lockup detector. nospec_store_bypass_disable - [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + [HW,EARLY] Disable all mitigations for the Speculative + Store Bypass vulnerability - nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch + nospectre_bhb [ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch history injection) vulnerability. System may allow data leaks with this option. - nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 + nospectre_v1 [X86,PPC,EARLY] Disable mitigations for Spectre Variant 1 (bounds check bypass). With this option data leaks are possible in the system. - nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for - the Spectre variant 2 (indirect branch prediction) - vulnerability. System may allow data leaks with this - option. + nospectre_v2 [X86,PPC_E500,ARM64,EARLY] Disable all mitigations + for the Spectre variant 2 (indirect branch + prediction) vulnerability. System may allow data + leaks with this option. - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -4008,7 +4017,7 @@ broken timer IRQ sources. no_uaccess_flush - [PPC] Don't flush the L1-D cache after accessing user data. + [PPC,EARLY] Don't flush the L1-D cache after accessing user data. novmcoredd [KNL,KDUMP] Disable device dump. Device dump allows drivers to @@ -4022,15 +4031,15 @@ is set. no-vmw-sched-clock - [X86,PV_OPS] Disable paravirtualized VMware scheduler - clock and use the default one. + [X86,PV_OPS,EARLY] Disable paravirtualized VMware + scheduler clock and use the default one. nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). - nowb [ARM] + nowb [ARM,EARLY] - nox2apic [X86-64,APIC] Do not enable x2APIC mode. + nox2apic [X86-64,APIC,EARLY] Do not enable x2APIC mode. NOTE: this parameter will be ignored on systems with the LEGACY_XAPIC_DISABLED bit set in the @@ -4068,7 +4077,7 @@ purges which is reported from either PAL_VM_SUMMARY or SAL PALO. - nr_cpus= [SMP] Maximum number of processors that an SMP kernel + nr_cpus= [SMP,EARLY] Maximum number of processors that an SMP kernel could support. nr_cpus=n : n >= 1 limits the kernel to support 'n' processors. It could be larger than the number of already plugged CPU during bootup, later in @@ -4079,8 +4088,9 @@ nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only - set up a single NUMA node spanning all memory. + numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY] + Disable NUMA, Only set up a single NUMA node + spanning all memory. numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic NUMA balancing. @@ -4091,7 +4101,7 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. - ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + ohci1394_dma=early [HW,EARLY] enable debugging via the ohci1394 driver. See Documentation/core-api/debugging-via-ohci1394.rst for more info. @@ -4117,7 +4127,8 @@ Once locked, the boundary cannot be changed. 1 indicates lock status, 0 indicates unlock status. - oops=panic Always panic on oopses. Default is to just kill the + oops=panic [KNL,EARLY] + Always panic on oopses. Default is to just kill the process, but there is a small probability of deadlocking the machine. This will also cause panics on machine check exceptions. @@ -4133,13 +4144,13 @@ can be read from sysfs at: /sys/module/page_alloc/parameters/shuffle. - page_owner= [KNL] Boot-time page_owner enabling option. + page_owner= [KNL,EARLY] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, we can turn it on. on: enable the feature - page_poison= [KNL] Boot-time parameter changing the state of + page_poison= [KNL,EARLY] Boot-time parameter changing the state of poisoning on the buddy allocator, available with CONFIG_PAGE_POISONING=y. off: turn off poisoning (default) @@ -4157,7 +4168,8 @@ timeout < 0: reboot immediately Format: - panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + panic_on_taint= [KNL,EARLY] + Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is @@ -4313,7 +4325,7 @@ pcbit= [HW,ISDN] - pci=option[,option...] [PCI] various PCI subsystem options. + pci=option[,option...] [PCI,EARLY] various PCI subsystem options. Some options herein operate on a specific device or a set of devices (). These are @@ -4582,7 +4594,8 @@ Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= Select which percpu first chunk allocator to use. + percpu_alloc= [MM,EARLY] + Select which percpu first chunk allocator to use. Currently supported values are "embed" and "page". Archs may support subset or none of the selections. See comments in mm/percpu.c for details on each @@ -4651,12 +4664,12 @@ execution priority. ppc_strict_facility_enable - [PPC] This option catches any kernel floating point, + [PPC,ENABLE] This option catches any kernel floating point, Altivec, VSX and SPE outside of regions specifically allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. - ppc_tm= [PPC] + ppc_tm= [PPC,EARLY] Format: {"off"} Disable Hardware Transactional Memory @@ -4766,7 +4779,7 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number. - quiet [KNL] Disable most log messages + quiet [KNL,EARLY] Disable most log messages r128= [HW,DRM] @@ -4783,17 +4796,17 @@ ramdisk_start= [RAM] RAM disk image start address random.trust_cpu=off - [KNL] Disable trusting the use of the CPU's + [KNL,EARLY] Disable trusting the use of the CPU's random number generator (if available) to initialize the kernel's RNG. random.trust_bootloader=off - [KNL] Disable trusting the use of the a seed + [KNL,EARLY] Disable trusting the use of the a seed passed by the bootloader (if available) to initialize the kernel's RNG. randomize_kstack_offset= - [KNL] Enable or disable kernel stack offset + [KNL,EARLY] Enable or disable kernel stack offset randomization, which provides roughly 5 bits of entropy, frustrating memory corruption attacks that depend on stack address determinism or @@ -5484,7 +5497,7 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. - rdrand= [X86] + rdrand= [X86,EARLY] force - Override the decision by the kernel to hide the advertisement of RDRAND support (this affects certain AMD processors because of buggy BIOS @@ -5580,7 +5593,7 @@ them. If is less than 0x10000, the region is assumed to be I/O ports; otherwise it is memory. - reservetop= [X86-32] + reservetop= [X86-32,EARLY] Format: nn[KMG] Reserves a hole at the top of the kernel virtual address space. @@ -5665,7 +5678,7 @@ [KNL] Disable ring 3 MONITOR/MWAIT feature on supported CPUs. - riscv_isa_fallback [RISCV] + riscv_isa_fallback [RISCV,EARLY] When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit falling back to detecting extension support by parsing "riscv,isa" property on devicetree systems when the @@ -5674,13 +5687,14 @@ ro [KNL] Mount root device read-only on boot - rodata= [KNL] + rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] rockchip.usb_uart + [EARLY] Enable the uart passthrough on the designated usb port on Rockchip SoCs. When active, the signals of the debug-uart get routed to the D+ and D- pins of the usb @@ -5741,7 +5755,7 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. - sched_verbose [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature @@ -5856,7 +5870,7 @@ non-zero "wait" parameter. See weight_single and weight_many. - skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + skew_tick= [KNL,EARLY] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. Format: { "0" | "1" } @@ -5987,10 +6001,10 @@ 1: Fast pin select (default) 2: ATC IRMode - smt= [KNL,MIPS,S390] Set the maximum number of threads (logical - CPUs) to use per physical CPU on systems capable of - symmetric multithreading (SMT). Will be capped to the - actual hardware limit. + smt= [KNL,MIPS,S390,EARLY] Set the maximum number of threads + (logical CPUs) to use per physical CPU on systems + capable of symmetric multithreading (SMT). Will + be capped to the actual hardware limit. Format: Default: -1 (no limit) @@ -6012,7 +6026,7 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst - spectre_v2= [X86] Control mitigation of Spectre variant 2 + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from user space attacks. @@ -6092,7 +6106,7 @@ spectre_v2_user=auto. spec_rstack_overflow= - [X86] Control RAS overflow mitigation on AMD Zen CPUs + [X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs off - Disable mitigation microcode - Enable microcode mitigation only @@ -6103,7 +6117,7 @@ (cloud-specific mitigation) spec_store_bypass_disable= - [HW] Control Speculative Store Bypass (SSB) Disable mitigation + [HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) Certain CPUs are vulnerable to an exploit against a @@ -6199,7 +6213,7 @@ #DB exception for bus lock is triggered only when CPL > 0. - srbds= [X86,INTEL] + srbds= [X86,INTEL,EARLY] Control the Special Register Buffer Data Sampling (SRBDS) mitigation. @@ -6286,7 +6300,7 @@ srcutree.convert_to_big must have the 0x10 bit set for contention-based conversions to occur. - ssbd= [ARM64,HW] + ssbd= [ARM64,HW,EARLY] Speculative Store Bypass Disable control On CPUs that are vulnerable to the Speculative @@ -6310,7 +6324,7 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. - stack_depot_disable= [KNL] + stack_depot_disable= [KNL,EARLY] Setting this to true through kernel command line will disable the stack depot thereby saving the static memory consumed by the stack hash table. By default this is set @@ -6349,12 +6363,12 @@ be used to filter out binaries which have not yet been made aware of AT_MINSIGSTKSZ. - stress_hpt [PPC] + stress_hpt [PPC,EARLY] Limits the number of kernel HPT entries in the hash page table to increase the rate of hash page table faults on kernel addresses. - stress_slb [PPC] + stress_slb [PPC,EARLY] Limits the number of kernel SLB entries, and flushes them frequently to increase the rate of SLB faults on kernel addresses. @@ -6414,7 +6428,7 @@ This parameter controls use of the Protected Execution Facility on pSeries. - swiotlb= [ARM,IA-64,PPC,MIPS,X86] + swiotlb= [ARM,IA-64,PPC,MIPS,X86,EARLY] Format: { [,] | force | noforce } -- Number of I/O TLB slabs -- Second integer after comma. Number of swiotlb @@ -6424,7 +6438,7 @@ wouldn't be automatically used by the kernel noforce -- Never use bounce buffers (for debugging) - switches= [HW,M68k] + switches= [HW,M68k,EARLY] sysctl.*= [KNL] Set a sysctl parameter, right before loading the init @@ -6483,11 +6497,11 @@ : poll all this frequency 0: no polling (default) - threadirqs [KNL] + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. - topology= [S390] + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu topology information if the hardware supports this. @@ -6728,7 +6742,7 @@ can be overridden by a later tsc=nowatchdog. A console message will flag any such suppression or overriding. - tsc_early_khz= [X86] Skip early TSC calibration and use the given + tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery procedure is not reliable, such as on overclocked systems with CPUID.16h support and partial CPUID.15h support. @@ -6763,7 +6777,7 @@ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst for more details. - tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async Abort (TAA) vulnerability. Similar to Micro-architectural Data Sampling (MDS) @@ -6829,7 +6843,7 @@ unknown_nmi_panic [X86] Cause panic on unknown NMI. - unwind_debug [X86-64] + unwind_debug [X86-64,EARLY] Enable unwinder debug output. This can be useful for debugging certain unwinder error conditions, including corrupt stacks and @@ -7019,7 +7033,7 @@ Example: user_debug=31 userpte= - [X86] Flags controlling user PTE allocations. + [X86,EARLY] Flags controlling user PTE allocations. nohigh = do not allocate PTE pages in HIGHMEM regardless of setting @@ -7048,7 +7062,7 @@ vector= [IA-64,SMP] vector=percpu: enable percpu vector domain - video= [FB] Frame buffer configuration + video= [FB,EARLY] Frame buffer configuration See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [ACPI] @@ -7096,13 +7110,13 @@ P Enable page structure init time poisoning - Disable all of the above options - vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact - size of . This can be used to increase the - minimum size (128MB on x86). It can also be used to - decrease the size and leave more room for directly - mapped kernel RAM. + vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an + exact size of . This can be used to increase + the minimum size (128MB on x86). It can also be + used to decrease the size and leave more room + for directly mapped kernel RAM. - vmcp_cma=nn[MG] [KNL,S390] + vmcp_cma=nn[MG] [KNL,S390,EARLY] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. @@ -7115,7 +7129,7 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: - vsyscall= [X86-64] + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy code). Most statically-linked binaries and older @@ -7263,13 +7277,13 @@ When enabled, memory and cache locality will be impacted. - writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of - ioremap_wc(). + writecombine= [LOONGARCH,EARLY] Control the MAT (Memory Access + Type) of ioremap_wc(). on - Enable writecombine, use WUC for ioremap_wc() off - Disable writecombine, use SUC for ioremap_wc() - x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of + x2apic_phys [X86-64,APIC,EARLY] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. @@ -7280,7 +7294,7 @@ save/restore/migration must be enabled to handle larger domains. - xen_emul_unplug= [HW,X86,XEN] + xen_emul_unplug= [HW,X86,XEN,EARLY] Unplug Xen emulated devices Format: [unplug0,][unplug1] ide-disks -- unplug primary master IDE devices @@ -7292,17 +7306,17 @@ the unplug protocol never -- do not unplug even if version check succeeds - xen_legacy_crash [X86,XEN] + xen_legacy_crash [X86,XEN,EARLY] Crash from Xen panic notifier, without executing late panic() code such as dumping handler. - xen_msr_safe= [X86,XEN] + xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR access functions when running as Xen PV guest. The default value is controlled by CONFIG_XEN_PV_MSR_SAFE. - xen_nopvspin [X86,XEN] + xen_nopvspin [X86,XEN,EARLY] Disables the qspinlock slowpath using Xen PV optimizations. This parameter is obsoleted by "nopvspin" parameter, which has equivalent effect for XEN platform. @@ -7314,7 +7328,7 @@ has equivalent effect for XEN platform. xen_no_vector_callback - [KNL,X86,XEN] Disable the vector callback for Xen + [KNL,X86,XEN,EARLY] Disable the vector callback for Xen event channel interrupts. xen_scrub_pages= [XEN] @@ -7323,7 +7337,7 @@ with /sys/devices/system/xen_memory/xen_memory0/scrub_pages. Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. - xen_timer_slop= [X86-64,XEN] + xen_timer_slop= [X86-64,XEN,EARLY] Set the timer slop (in nanoseconds) for the virtual Xen timers (default is 100000). This adjusts the minimum delta of virtualized Xen timers, where lower values @@ -7376,7 +7390,7 @@ host controller quirks. Meaning of each bit can be consulted in header drivers/usb/host/xhci.h. - xmon [PPC] + xmon [PPC,EARLY] Format: { early | on | rw | ro | off } Controls if xmon debugger is enabled. Default is off. Passing only "xmon" is equivalent to "xmon=early". From b42cdd2cbe2b3f677b3eb484c6c6bb4fa10e269c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Dec 2023 09:33:29 -0800 Subject: [PATCH 198/707] rcu-tasks: Repair RCU Tasks Trace quiescence check The context-switch-time check for RCU Tasks Trace quiescence expects current->trc_reader_special.b.need_qs to be zero, and if so, updates it to TRC_NEED_QS_CHECKED. This is backwards, because if this value is zero, there is no RCU Tasks Trace grace period in flight, an thus no need for a quiescent state. Instead, when a grace period starts, this field is set to TRC_NEED_QS. This commit therefore changes the check from zero to TRC_NEED_QS. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Steven Rostedt (Google) --- include/linux/rcupdate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0746b1b0b6639d..16f519914415eb 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,9 +184,9 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ - if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ + if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ - rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ + rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ From 43824241662ac61e90a023f908f267e6b137626b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Nov 2023 14:11:26 -0500 Subject: [PATCH 199/707] rcu: Rename jiffies_till_flush to jiffies_lazy_flush The variable name jiffies_till_flush is too generic and therefore: * It may shadow a global variable * It doesn't tell on what it operates Make the name more precise, along with the related APIs. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 8 ++++---- kernel/rcu/rcuscale.c | 6 +++--- kernel/rcu/tree_nocb.h | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f94f65877f2b68..dcfb666f24993f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -543,11 +543,11 @@ enum rcutorture_type { }; #if defined(CONFIG_RCU_LAZY) -unsigned long rcu_lazy_get_jiffies_till_flush(void); -void rcu_lazy_set_jiffies_till_flush(unsigned long j); +unsigned long rcu_get_jiffies_lazy_flush(void); +void rcu_set_jiffies_lazy_flush(unsigned long j); #else -static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } -static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } +static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; } +static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index ffdb30495e3cc3..8db4fedaaa1eb7 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -764,9 +764,9 @@ kfree_scale_init(void) if (kfree_by_call_rcu) { /* do a test to check the timeout. */ - orig_jif = rcu_lazy_get_jiffies_till_flush(); + orig_jif = rcu_get_jiffies_lazy_flush(); - rcu_lazy_set_jiffies_till_flush(2 * HZ); + rcu_set_jiffies_lazy_flush(2 * HZ); rcu_barrier(); jif_start = jiffies; @@ -775,7 +775,7 @@ kfree_scale_init(void) smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); - rcu_lazy_set_jiffies_till_flush(orig_jif); + rcu_set_jiffies_lazy_flush(orig_jif); if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 4efbf7333d4e16..aecef51166c7e2 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) return __wake_nocb_gp(rdp_gp, rdp, force, flags); } +#ifdef CONFIG_RCU_LAZY /* * LAZY_FLUSH_JIFFIES decides the maximum amount of time that * can elapse before lazy callbacks are flushed. Lazy callbacks @@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) * left unsubmitted to RCU after those many jiffies. */ #define LAZY_FLUSH_JIFFIES (10 * HZ) -static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; +static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES; -#ifdef CONFIG_RCU_LAZY // To be called only from test code. -void rcu_lazy_set_jiffies_till_flush(unsigned long jif) +void rcu_set_jiffies_lazy_flush(unsigned long jif) { - jiffies_till_flush = jif; + jiffies_lazy_flush = jif; } -EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); +EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush); -unsigned long rcu_lazy_get_jiffies_till_flush(void) +unsigned long rcu_get_jiffies_lazy_flush(void) { - return jiffies_till_flush; + return jiffies_lazy_flush; } -EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); +EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush); #endif /* @@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, */ if (waketype == RCU_NOCB_WAKE_LAZY && rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { - mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); + mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { mod_timer(&rdp_gp->nocb_timer, jiffies + 2); @@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // flush ->nocb_bypass to ->cblist. if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || (ncbs && bypass_is_lazy && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || ncbs >= qhimark) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); @@ -723,7 +723,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) lazy_ncbs = READ_ONCE(rdp->lazy_len); if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) || bypass_ncbs > 2 * qhimark)) { flush_bypass = true; } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && From 573e094763eaccc6f4d58fea420ce8b890edbcbb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Nov 2023 14:11:27 -0500 Subject: [PATCH 200/707] rcu/nocb: Remove needless LOAD-ACQUIRE The LOAD-ACQUIRE access performed on rdp->nocb_cb_sleep advertizes ordering callback execution against grace period completion. However this is contradicted by the following: * This LOAD-ACQUIRE doesn't pair with anything. The only counterpart barrier that can be found is the smp_mb() placed after callbacks advancing in nocb_gp_wait(). However the barrier is placed _after_ ->nocb_cb_sleep write. * Callbacks can be concurrently advanced between the LOAD-ACQUIRE on ->nocb_cb_sleep and the call to rcu_segcblist_extract_done_cbs() in rcu_do_batch(), making any ordering based on ->nocb_cb_sleep broken. * Both rcu_segcblist_extract_done_cbs() and rcu_advance_cbs() are called under the nocb_lock, the latter hereby providing already the desired ACQUIRE semantics. Therefore it is safe to access ->nocb_cb_sleep with a simple compiler barrier. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index aecef51166c7e2..eb27878d46f1fe 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -933,8 +933,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } From e1173cfcd991870c0782f5b8c6ea5e9ffed66771 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Nov 2023 14:11:28 -0500 Subject: [PATCH 201/707] rcu/nocb: Remove needless full barrier after callback advancing A full barrier is issued from nocb_gp_wait() upon callbacks advancing to order grace period completion with callbacks execution. However these two events are already ordered by the smp_mb__after_unlock_lock() barrier within the call to raw_spin_lock_rcu_node() that is necessary for callbacks advancing to happen. The following litmus test shows the kind of guarantee that this barrier provides: C smp_mb__after_unlock_lock {} // rcu_gp_cleanup() P0(spinlock_t *rnp_lock, int *gpnum) { // Grace period cleanup increase gp sequence number spin_lock(rnp_lock); WRITE_ONCE(*gpnum, 1); spin_unlock(rnp_lock); } // nocb_gp_wait() P1(spinlock_t *rnp_lock, spinlock_t *nocb_lock, int *gpnum, int *cb_ready) { int r1; // Call rcu_advance_cbs() from nocb_gp_wait() spin_lock(nocb_lock); spin_lock(rnp_lock); smp_mb__after_unlock_lock(); r1 = READ_ONCE(*gpnum); WRITE_ONCE(*cb_ready, 1); spin_unlock(rnp_lock); spin_unlock(nocb_lock); } // nocb_cb_wait() P2(spinlock_t *nocb_lock, int *cb_ready, int *cb_executed) { int r2; // rcu_do_batch() -> rcu_segcblist_extract_done_cbs() spin_lock(nocb_lock); r2 = READ_ONCE(*cb_ready); spin_unlock(nocb_lock); // Actual callback execution WRITE_ONCE(*cb_executed, 1); } P3(int *cb_executed, int *gpnum) { int r3; WRITE_ONCE(*cb_executed, 2); smp_mb(); r3 = READ_ONCE(*gpnum); } exists (1:r1=1 /\ 2:r2=1 /\ cb_executed=2 /\ 3:r3=0) (* Bad outcome. *) Here the bad outcome only occurs if the smp_mb__after_unlock_lock() is removed. This barrier orders the grace period completion against callbacks advancing and even later callbacks invocation, thanks to the opportunistic propagation via the ->nocb_lock to nocb_cb_wait(). Therefore the smp_mb() placed after callbacks advancing can be safely removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_nocb.h | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2bccfd37c383d..d540d210e5c71a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. + * + * Callbacks execution is fully ordered against preceding grace period + * completion (materialized by rnp->gp_seq update) thanks to the + * smp_mb__after_unlock_lock() upon node locking required for callbacks + * advancing. In NOCB mode this ordering is then further relayed through + * the nocb locking that protects both callbacks advancing and extraction. */ rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eb27878d46f1fe..d82f96a66600cb 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -779,7 +779,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (rcu_segcblist_ready_cbs(&rdp->cblist)) { needwake = rdp->nocb_cb_sleep; WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ } else { needwake = false; } From 8fc521f2a25ad3ca51d30ee0ecd0a693d2f9c6fc Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 3 Dec 2023 01:12:52 +0000 Subject: [PATCH 202/707] rcu: Provide a boot time parameter to control lazy RCU To allow more flexible arrangements while still provide a single kernel for distros, provide a boot time parameter to enable/disable lazy RCU. Specify: rcutree.enable_rcu_lazy=[y|1|n|0] Which also requires rcu_nocbs=all at boot time to enable/disable lazy RCU. To disable it by default at build time when CONFIG_RCU_LAZY=y, the new CONFIG_RCU_LAZY_DEFAULT_OFF can be used. Signed-off-by: Qais Yousef (Google) Tested-by: Andrea Righi Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ kernel/rcu/Kconfig | 13 +++++++++++++ kernel/rcu/tree.c | 7 ++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4839f2919fdfa6..94314d0eb3019b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5047,6 +5047,11 @@ this kernel boot parameter, forcibly setting it to zero. + rcutree.enable_rcu_lazy= [KNL] + To save power, batch RCU callbacks and flush after + delay, memory pressure or callback list growing too + big. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bdd7eadb33d8fe..e7d2dd2675931f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,6 +314,19 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. + Requires rcu_nocbs=all to be set. + + Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + +config RCU_LAZY_DEFAULT_OFF + bool "Turn RCU lazy invocation off by default" + depends on RCU_LAZY + default n + help + Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default + off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch + it back on. + config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" depends on RCU_EXPERT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d540d210e5c71a..49980323417606 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2759,6 +2759,9 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } #ifdef CONFIG_RCU_LAZY +static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF); +module_param(enable_rcu_lazy, bool, 0444); + /** * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and * flush all lazy callbacks (including the new one) to the main ->cblist while @@ -2784,6 +2787,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); +#else +#define enable_rcu_lazy false #endif /** @@ -2832,7 +2837,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, enable_rcu_lazy); } EXPORT_SYMBOL_GPL(call_rcu); From f17ad8bd109b904a9368a5fa85d8fa18b7c3e704 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Dec 2023 20:34:58 -0800 Subject: [PATCH 203/707] context_tracking: Fix kerneldoc headers for __ct_user_{enter,exit}() Document the "state" parameter of both of these functions. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312041922.YZCcEPYD-lkp@intel.com/ Signed-off-by: Paul E. McKenney Tested-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Frederic Weisbecker --- kernel/context_tracking.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6ef0b35fc28c5a..70ae70d0382337 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void) * __ct_user_enter - Inform the context tracking that the CPU is going * to enter user or guest space mode. * + * @state: userspace context-tracking state to enter. + * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section @@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable); * __ct_user_exit - Inform the context tracking that the CPU is * exiting user or guest mode and entering the kernel. * + * @state: userspace context-tracking state being exited from. + * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This * potentially include any high level kernel code like syscalls, exceptions, From de95870fd0fb9355904531693de92e01b27e688c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Dec 2023 11:55:17 -0800 Subject: [PATCH 204/707] doc: Clarify use of slab constructors and SLAB_TYPESAFE_BY_RCU This commit explicitly states that you should initialize any locks to be used by readers in your SLAB_TYPESAFE_BY_RCU constructor. Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 246ce0d0b4d116..872ac665223fbd 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -963,8 +963,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be initialized after each and every call to kmem_cache_alloc(), which renders reference-free spinlock acquisition completely unsafe. Therefore, when using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. -(Those willing to use a kmem_cache constructor may also use locking, -including cache-friendly sequence locking.) +(Those willing to initialize their locks in a kmem_cache constructor +may also use locking, including cache-friendly sequence locking.) With traditional reference counting -- such as that implemented by the kref library in Linux -- there is typically code that runs when the last From 18cfea5d54482c8ba35a2011db5685100455862b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Dec 2023 09:49:20 -0800 Subject: [PATCH 205/707] doc: Update checklist.rst discussion of callback execution This commit completes the list of call_rcu*() functions that are not guaranteed to have their callbacks executing on the same CPU. While in the area, fix an unrelated typo. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.rst | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index addd5c1547a420..3e6407de231c99 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -383,16 +383,17 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on the same - CPU that executed the corresponding call_rcu() or call_srcu(). - For example, if a given CPU goes offline while having an RCU - callback pending, then that RCU callback will execute on some - surviving CPU. (If this was not the case, a self-spawning RCU - callback would prevent the victim CPU from ever going offline.) - Furthermore, CPUs designated by rcu_nocbs= might well *always* - have their RCU callbacks executed on some other CPUs, in fact, - for some real-time workloads, this is the whole point of using - the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on + the same CPU that executed the corresponding call_rcu(), + call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or + call_rcu_tasks_trace(). For example, if a given CPU goes offline + while having an RCU callback pending, then that RCU callback + will execute on some surviving CPU. (If this was not the case, + a self-spawning RCU callback would prevent the victim CPU from + ever going offline.) Furthermore, CPUs designated by rcu_nocbs= + might well *always* have their RCU callbacks executed on some + other CPUs, in fact, for some real-time workloads, this is the + whole point of using the rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the From ff0f64285a9e6d3e5390ba16252ec8c410577fc3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Dec 2023 00:19:14 +0100 Subject: [PATCH 206/707] hrtimer: Report offline hrtimer enqueue The hrtimers migration on CPU-down hotplug process has been moved earlier, before the CPU actually goes to die. This leaves a small window of opportunity to queue an hrtimer in a blind spot, leaving it ignored. For example a practical case has been reported with RCU waking up a SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that way a sched/rt timer to the local offline CPU. Make sure such situations never go unnoticed and warn when that happens. [ paulmck: Apply Stephen Rothwell feedback. ] Reported-by: Paul E. McKenney Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/hrtimer.h | 4 +++- kernel/time/hrtimer.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 87e3bedf8eb003..991c83e929b456 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. + * @online: CPU is online from an hrtimers viewpoint * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs @@ -179,7 +180,8 @@ struct hrtimer_cpu_base { unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, - softirq_activated : 1; + softirq_activated : 1, + online : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 760793998cdd70..edb0f821dceaa1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); + old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; From bd4f7f10c9028148395beec6d79e42f5c98568c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Dec 2023 00:19:16 +0100 Subject: [PATCH 207/707] rcu/exp: Remove full barrier upon main thread wakeup When an expedited grace period is ending, care must be taken so that all the quiescent states propagated up to the root are correctly ordered against the wake up of the main expedited grace period workqueue. This ordering is already carried through the root rnp locking augmented by an smp_mb__after_unlock_lock() barrier. Therefore the explicit smp_mb() placed before the wake up is not needed and can be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2ac440bc7e10bc..014ddf672165d3 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ + if (wake) swake_up_one_online(&rcu_state.expedited_wq); - } + break; } mask = rnp->grpmask; From f5b859c4a2fe44c0ece7aad9dd2e3d3b7fd0385c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 27 Dec 2023 12:47:38 -0500 Subject: [PATCH 208/707] srcu: Improve comments about acceleration leak The comments added in commit 1ef990c4b36b ("srcu: No need to advance/accelerate if no callback enqueued") are a bit confusing. The comments are describing a scenario for code that was moved and is no longer the way it was (snapshot after advancing). Improve the code comments to reflect this and also document why acceleration can never fail. Cc: Frederic Weisbecker Cc: Neeraj Upadhyay Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0351a4e83529e3..e4d673fc30f42f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* - * The snapshot for acceleration must be taken _before_ the read of the - * current gp sequence used for advancing, otherwise advancing may fail - * and acceleration may then fail too. + * It's crucial to capture the snapshot 's' for acceleration before + * reading the current gp_seq that is used for advancing. This is + * essential because if the acceleration snapshot is taken after a + * failed advancement attempt, there's a risk that a grace period may + * conclude and a new one may start in the interim. If the snapshot is + * captured after this sequence of events, the acceleration snapshot 's' + * could be excessively advanced, leading to acceleration failure. + * In such a scenario, an 'acceleration leak' can occur, where new + * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment. + * Also note that encountering advancing failures is a normal + * occurrence when the grace period for RCU_WAIT_TAIL is in progress. * - * This could happen if: + * To see this, consider the following events which occur if + * rcu_seq_snap() were to be called after advance: * * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). @@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) { rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Acceleration can never fail because the base current gp_seq + * used for acceleration is <= the value of gp_seq used for + * advancing. This means that RCU_NEXT_TAIL segment will + * always be able to be emptied by the acceleration into the + * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments. + */ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { From 296088f9b5a9a98bd84b19f71b12600c7bfa2c05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jan 2024 15:55:12 -0800 Subject: [PATCH 209/707] tsc: Check for sockets instead of CPUs to make code match comment The unsynchronized_tsc() eventually checks num_possible_cpus(), and if the system is non-Intel and the number of possible CPUs is greater than one, assumes that TSCs are unsynchronized. This despite the comment saying "assume multi socket systems are not synchronized", that is, socket rather than CPU. This behavior was preserved by commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the update callback"). The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386 Clocksource Drivers") back in 2006, and the comment still said "socket" rather than "CPU". Therefore, bravely (and perhaps foolishly) make the code match the comment. Note that it is possible to bypass both code and comment by booting with tsc=reliable, but this also disables the clocksource watchdog, which is undesirable when trust in the TSC is strictly limited. Reported-by: Zhengxu Chen Reported-by: Danielle Costantino Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Feng Tang Cc: Waiman Long Cc: John Stultz Cc: --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 15f97c0abc9d09..d45084c6a15ed3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (nr_online_nodes > 1) return 1; } From 5dd763ec737d07330f3b9f4cdc78318a96bcc7af Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Tue, 2 Jan 2024 18:19:37 +0800 Subject: [PATCH 210/707] fs/proc: remove redudant comments from /proc/bootconfig commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") adds bootloader argument comments into /proc/bootconfig. /proc/bootconfig shows boot_command_line[] multiple times following every xbc key value pair, that's duplicated and not necessary. Remove redundant ones. Output before and after the fix is like: key1 = value1 *bootloader argument comments* key2 = value2 *bootloader argument comments* key3 = value3 *bootloader argument comments* ... key1 = value1 key2 = value2 key3 = value3 *bootloader argument comments* ... Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") Signed-off-by: Zhenhua Huang Signed-off-by: Paul E. McKenney --- fs/proc/bootconfig.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e5607..e5635a6b127b0b 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); From cf59604abb7e8e7784d105a4476e63ca1ccaa209 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jan 2024 10:59:25 -0800 Subject: [PATCH 211/707] rcutorture: Suppress rtort_pipe_count warnings until after stalls Currently, if rcu_torture_writer() sees fewer than ten grace periods having elapsed during a call to stutter_wait() that actually waited, the rtort_pipe_count warning is emitted. This has worked well for a long time. Except that the rcutorture TREE07 scenario now does a short-term 14-second RCU CPU stall, which can most definitely case false-positive rtort_pipe_count warnings. This commit therefore changes rcu_torture_writer() to compute the full expected holdoff and stall duration, and to refuse to report any rtort_pipe_count warnings until after all stalls have completed. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7567ca8e743ca6..45d6b4c3d199c1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg) struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + unsigned long stallsdone = jiffies; bool stutter_waited; unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + // If a new stall test is added, this must be adjusted. + if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg) !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended) + boot_ended && + time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && - rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) { + rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); @@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = { /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. + * induces a CPU stall for the time specified by stall_cpu. If a new + * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ static int rcu_torture_stall(void *args) { From 5d15c90ead66043db4ab79c17e35f658e1a3ea51 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Jan 2024 23:24:00 +0100 Subject: [PATCH 212/707] rcu/nocb: Make IRQs disablement symmetric Currently IRQs are disabled on call_rcu() and then depending on the context: * If the CPU is in nocb mode: - If the callback is enqueued in the bypass list, IRQs are re-enabled implictly by rcu_nocb_try_bypass() - If the callback is enqueued in the normal list, IRQs are re-enabled implicitly by __call_rcu_nocb_wake() * If the CPU is NOT in nocb mode, IRQs are reenabled explicitly from call_rcu() This makes the code a bit hard to follow, especially as it interleaves with nocb locking. To make the IRQ flags coverage clearer and also in order to prepare for moving all the nocb enqueue code to its own function, always re-enable the IRQ flags explicitly from call_rcu(). Reviewed-by: Neeraj Upadhyay (AMD) Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- kernel/rcu/tree_nocb.h | 20 +++++++++----------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 49980323417606..91b2eb772e861f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2735,8 +2735,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + local_irq_restore(flags); return; // Enqueued onto ->nocb_bypass, so just leave. + } // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kvfree_rcu_offset((unsigned long)func)) @@ -2754,8 +2756,8 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); } + local_irq_restore(flags); } #ifdef CONFIG_RCU_LAZY @@ -4651,8 +4653,9 @@ void rcutree_migrate_callbacks(int cpu) __call_rcu_nocb_wake(my_rdp, true, flags); } else { rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ } + local_irq_restore(flags); if (needwake) rcu_gp_kthread_wake(); lockdep_assert_irqs_enabled(); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index d82f96a66600cb..06c8ff85850ccb 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // 2. Both of these conditions are met: // a. The bypass list previously had only lazy CBs, and: // b. The new CB is non-lazy. - if (ncbs && (!bypass_is_lazy || lazy)) { - local_irq_restore(flags); - } else { + if (!ncbs || (bypass_is_lazy && !lazy)) { // No-CBs GP kthread might be indefinitely asleep, if so, wake. rcu_nocb_lock(rdp); // Rare during call_rcu() flood. if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { @@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); } } return true; // Callback already enqueued. @@ -570,7 +568,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; @@ -583,17 +581,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rdp->qlen_last_fqs_check = len; // Only lazy CBs in bypass list if (lazy_len && bypass_len == lazy_len) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); } @@ -611,15 +609,15 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } From c1856a43a68ada47d1285d4c7223d3bca94e3f44 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Jan 2024 23:24:01 +0100 Subject: [PATCH 213/707] rcu/nocb: Re-arrange call_rcu() NOCB specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the call_rcu() function interleaves NOCB and !NOCB enqueue code in a complicated way such that: * The bypass enqueue code may or may not have enqueued and may or may not have locked the ->nocb_lock. Everything that follows is in a Schrödinger locking state for the unwary reviewer's eyes. * The was_alldone is always set but only used in NOCB related code. * The NOCB wake up is distantly related to the locking hopefully performed by the bypass enqueue code that did not enqueue on the bypass list. Unconfuse the whole and gather NOCB and !NOCB specific enqueue code to their own functions. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 44 +++++++++++++++++++----------------------- kernel/rcu/tree.h | 9 ++++----- kernel/rcu/tree_nocb.h | 18 ++++++++++++++--- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 91b2eb772e861f..de5796ce024fec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2597,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void) return 0; } +static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) +{ + rcu_segcblist_enqueue(&rdp->cblist, head); + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, + (unsigned long)func, + rcu_segcblist_n_cbs(&rdp->cblist)); + else + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); +} + /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, - unsigned long flags) +static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags) { + rcutree_enqueue(rdp, head, func); /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2698,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) unsigned long flags; bool lazy; struct rcu_data *rdp; - bool was_alldone; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2735,28 +2748,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { - local_irq_restore(flags); - return; // Enqueued onto ->nocb_bypass, so just leave. - } - // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. - rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); - - trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); - /* Go handle any RCU core processing required. */ - if (unlikely(rcu_rdp_is_offloaded(rdp))) { - __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ - } else { - __call_rcu_core(rdp, head, flags); - } + if (unlikely(rcu_rdp_is_offloaded(rdp))) + call_rcu_nocb(rdp, head, func, flags, lazy); + else + call_rcu_core(rdp, head, func, flags); local_irq_restore(flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e9821a8422dbe7..bf478da89a8f33 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -467,11 +467,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp); static bool wake_nocb_gp(struct rcu_data *rdp, bool force); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j, bool lazy); -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, - bool lazy); -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags); +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy); +static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 06c8ff85850ccb..5fd47ea6d20eaa 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -622,6 +622,18 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + bool was_alldone; + + if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + /* Not enqueued on bypass but locked, do regular enqueue */ + rcutree_enqueue(rdp, head, func); + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } +} + static int nocb_gp_toggle_rdp(struct rcu_data *rdp, bool *wake_state) { @@ -1764,10 +1776,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return true; } -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, bool lazy) +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) { - return false; + WARN_ON_ONCE(1); /* Should be dead code! */ } static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, From 862890ed53fb1da491874f1cab49f4b45c2151bc Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 10 Jan 2024 16:11:28 +0800 Subject: [PATCH 214/707] rcu/nocb: Fix WARN_ON_ONCE() in the rcu_nocb_bypass_lock() For the kernels built with CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y and CONFIG_RCU_LAZY=y, the following scenarios will trigger WARN_ON_ONCE() in the rcu_nocb_bypass_lock() and rcu_nocb_wait_contended() functions: CPU2 CPU11 kthread rcu_nocb_cb_kthread ksys_write rcu_do_batch vfs_write rcu_torture_timer_cb proc_sys_write __kmem_cache_free proc_sys_call_handler kmemleak_free drop_caches_sysctl_handler delete_object_full drop_slab __delete_object shrink_slab put_object lazy_rcu_shrink_scan call_rcu rcu_nocb_flush_bypass __call_rcu_commn rcu_nocb_bypass_lock raw_spin_trylock(&rdp->nocb_bypass_lock) fail atomic_inc(&rdp->nocb_lock_contended); rcu_nocb_wait_contended WARN_ON_ONCE(smp_processor_id() != rdp->cpu); WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)) | |_ _ _ _ _ _ _ _ _ _same rdp and rdp->cpu != 11_ _ _ _ _ _ _ _ _ __| Reproduce this bug with "echo 3 > /proc/sys/vm/drop_caches". This commit therefore uses rcu_nocb_try_flush_bypass() instead of rcu_nocb_flush_bypass() in lazy_rcu_shrink_scan(). If the nocb_bypass queue is being flushed, then rcu_nocb_try_flush_bypass will return directly. Signed-off-by: Zqiang Reviewed-by: Joel Fernandes (Google) Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5fd47ea6d20eaa..54971afc3a9b25 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1391,7 +1391,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) rcu_nocb_unlock_irqrestore(rdp, flags); continue; } - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + rcu_nocb_try_flush_bypass(rdp, jiffies); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; From 35a2b37256083fa5302f87a106261cdb4a11a3a9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:15 +0100 Subject: [PATCH 215/707] rcu/exp: Fix RCU expedited parallel grace period kworker allocation failure recovery Under CONFIG_RCU_EXP_KTHREAD=y, the nodes initialization for expedited grace periods is queued to a kworker. However if the allocation of that kworker failed, the nodes initialization is performed synchronously by the caller instead. Now the check for kworker initialization failure relies on the kworker pointer to be NULL while its value might actually encapsulate an allocation failure error. Make sure to handle this case. Reviewed-by: Kalesh Singh Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index de5796ce024fec..65d730a2b492e5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4759,6 +4759,7 @@ static void __init rcu_start_exp_gp_kworkers(void) rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { pr_err("Failed to create %s!\n", par_gp_kworker_name); + rcu_exp_par_gp_kworker = NULL; kthread_destroy_worker(rcu_exp_gp_kworker); return; } From f3a491f0ecde531df2a3dd75684565e959a16f5a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:16 +0100 Subject: [PATCH 216/707] rcu/exp: Handle RCU expedited grace period kworker allocation failure Just like is done for the kworker performing nodes initialization, gracefully handle the possible allocation failure of the RCU expedited grace period main kworker. While at it perform a rename of the related checking functions to better reflect the expedited specifics. Reviewed-by: Kalesh Singh Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree_exp.h | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65d730a2b492e5..3777fd305f2ef5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4753,6 +4753,7 @@ static void __init rcu_start_exp_gp_kworkers(void) rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", gp_kworker_name); + rcu_exp_gp_kworker = NULL; return; } @@ -4761,6 +4762,7 @@ static void __init rcu_start_exp_gp_kworkers(void) pr_err("Failed to create %s!\n", par_gp_kworker_name); rcu_exp_par_gp_kworker = NULL; kthread_destroy_worker(rcu_exp_gp_kworker); + rcu_exp_gp_kworker = NULL; return; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 014ddf672165d3..6123a60d9a4d7c 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -427,7 +427,12 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_exp_par_gp_kworker); } @@ -477,7 +482,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_gp_wq); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_par_gp_wq); } @@ -540,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_gp_par_worker_started() || + if (!rcu_exp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ @@ -955,7 +965,7 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + bool use_worker; unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -966,6 +976,9 @@ void synchronize_rcu_expedited(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) && + rcu_exp_worker_started(); + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. @@ -995,7 +1008,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(boottime)) { + if (unlikely(!use_worker)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -1013,7 +1026,7 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - if (likely(!boottime)) + if (likely(use_worker)) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 7eebf1849f6e455861215c5710c8d55eb581a4cc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:17 +0100 Subject: [PATCH 217/707] rcu: s/boost_kthread_mutex/kthread_mutex This mutex is currently protecting per node boost kthreads creation and affinity setting across CPU hotplug operations. Since the expedited kworkers will soon be split per node as well, they will be subject to the same concurrency constraints against hotplug. Therefore their creation and affinity tuning operations will be grouped with those of boost kthreads and then rely on the same mutex. To prepare for that, generalize its name. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3777fd305f2ef5..0bf6971895194b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4928,7 +4928,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); - mutex_init(&rnp->boost_kthread_mutex); + mutex_init(&rnp->kthread_mutex); raw_spin_lock_init(&rnp->exp_poll_lock); rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bf478da89a8f33..adf8609f27d038 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,7 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ - struct mutex boost_kthread_mutex; + struct mutex kthread_mutex; /* Exclusion for thread spawning and affinity */ /* manipulation. */ struct task_struct *boost_kthread_task; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 41021080ad258d..0d307674915c60 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1195,7 +1195,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->boost_kthread_mutex); + mutex_lock(&rnp->kthread_mutex); if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) goto out; @@ -1212,7 +1212,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ out: - mutex_unlock(&rnp->boost_kthread_mutex); + mutex_unlock(&rnp->kthread_mutex); } /* @@ -1224,7 +1224,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. * - * Any future concurrent calls are serialized via ->boost_kthread_mutex. + * Any future concurrent calls are serialized via ->kthread_mutex. */ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { @@ -1237,7 +1237,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - mutex_lock(&rnp->boost_kthread_mutex); + mutex_lock(&rnp->kthread_mutex); mask = rcu_rnp_online_cpus(rnp); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && @@ -1250,7 +1250,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(outgoingcpu, cm); } set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->boost_kthread_mutex); + mutex_unlock(&rnp->kthread_mutex); free_cpumask_var(cm); } From a7658d497dfc48b8b35ffe97a599f891aa8d0d75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:18 +0100 Subject: [PATCH 218/707] rcu/exp: Move expedited kthread worker creation functions above rcutree_prepare_cpu() The expedited kthread worker performing the per node initialization is going to be split into per node kthreads. As such, the future per node kthread creation will need to be called from CPU hotplug callbacks instead of an initcall, right beside the per node boost kthread creation. To prepare for that, move the kthread worker creation above rcutree_prepare_cpu() as a first step to make the review smoother for the upcoming modifications. No intended functional change. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 96 +++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bf6971895194b..9fc71dd712d6e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4403,6 +4403,54 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + rcu_exp_gp_kworker = NULL; + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + rcu_exp_par_gp_kworker = NULL; + kthread_destroy_worker(rcu_exp_gp_kworker); + rcu_exp_gp_kworker = NULL; + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -4740,54 +4788,6 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } -#ifdef CONFIG_RCU_EXP_KTHREAD -struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; - -static void __init rcu_start_exp_gp_kworkers(void) -{ - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; - struct sched_param param = { .sched_priority = kthread_prio }; - - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); - rcu_exp_gp_kworker = NULL; - return; - } - - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - rcu_exp_par_gp_kworker = NULL; - kthread_destroy_worker(rcu_exp_gp_kworker); - rcu_exp_gp_kworker = NULL; - return; - } - - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void __init rcu_start_exp_gp_kworkers(void) -{ -} - -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Spawn the kthreads that handle RCU's grace periods. */ From de84f732f404d30643ddcb55c6aad45c88ea10bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:19 +0100 Subject: [PATCH 219/707] rcu/exp: Make parallel exp gp kworker per rcu node When CONFIG_RCU_EXP_KTHREAD=n, the expedited grace period per node initialization is performed in parallel via workqueues (one work per node). However in CONFIG_RCU_EXP_KTHREAD=y, this per node initialization is performed by a single kworker serializing each node initialization (one work for all nodes). The second part is certainly less scalable and efficient beyond a single leaf node. To improve this, expand this single kworker into per-node kworkers. This new layout is eventually intended to remove the workqueues based implementation since it will essentially now become duplicate code. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 1 - kernel/rcu/tree.c | 61 +++++++++++++++++++++++++++------------- kernel/rcu/tree.h | 3 ++ kernel/rcu/tree_exp.h | 10 +++---- kernel/rcu/tree_plugin.h | 10 ++----- 5 files changed, 52 insertions(+), 33 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index dcfb666f24993f..4bc8cd6d461e64 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -625,7 +625,6 @@ void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; #ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -extern struct kthread_worker *rcu_exp_par_gp_kworker; #else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; #endif /* CONFIG_RCU_EXP_KTHREAD */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9fc71dd712d6e1..5371f8fa0ee21b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4405,33 +4405,39 @@ rcu_boot_init_percpu_data(int cpu) #ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; -static void __init rcu_start_exp_gp_kworkers(void) +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) { - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct kthread_worker *kworker; + const char *name = "rcu_exp_par_gp_kthread_worker/%d"; struct sched_param param = { .sched_priority = kthread_prio }; + int rnp_index = rnp - rcu_get_root(); - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); - rcu_exp_gp_kworker = NULL; + if (rnp->exp_kworker) + return; + + kworker = kthread_create_worker(0, name, rnp_index); + if (IS_ERR_OR_NULL(kworker)) { + pr_err("Failed to create par gp kworker on %d/%d\n", + rnp->grplo, rnp->grphi); return; } + WRITE_ONCE(rnp->exp_kworker, kworker); + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); +} - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - rcu_exp_par_gp_kworker = NULL; - kthread_destroy_worker(rcu_exp_gp_kworker); +static void __init rcu_start_exp_gp_kworker(void) +{ + const char *name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); } static inline void rcu_alloc_par_gp_wq(void) @@ -4440,7 +4446,11 @@ static inline void rcu_alloc_par_gp_wq(void) #else /* !CONFIG_RCU_EXP_KTHREAD */ struct workqueue_struct *rcu_par_gp_wq; -static void __init rcu_start_exp_gp_kworkers(void) +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) +{ +} + +static void __init rcu_start_exp_gp_kworker(void) { } @@ -4451,6 +4461,17 @@ static inline void rcu_alloc_par_gp_wq(void) } #endif /* CONFIG_RCU_EXP_KTHREAD */ +static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) +{ + if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) || + IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) { + mutex_lock(&rnp->kthread_mutex); + rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_exp_par_gp_kworker(rnp); + mutex_unlock(&rnp->kthread_mutex); + } +} + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -4499,7 +4520,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); @@ -4822,10 +4843,10 @@ static int __init rcu_spawn_gp_kthread(void) * due to rcu_scheduler_fully_active. */ rcu_spawn_cpu_nocb_kthread(smp_processor_id()); - rcu_spawn_one_boost_kthread(rdp->mynode); + rcu_spawn_rnp_kthreads(rdp->mynode); rcu_spawn_core_kthreads(); /* Create kthread worker for expedited GPs */ - rcu_start_exp_gp_kworkers(); + rcu_start_exp_gp_kworker(); return 0; } early_initcall(rcu_spawn_gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index adf8609f27d038..a0c02d631f448e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -72,6 +72,9 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + struct kthread_worker *exp_kworker; + /* Workers performing per node expedited GP */ + /* initialization. */ unsigned long cbovldmask; /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6123a60d9a4d7c..0318a8a062d5c0 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -432,9 +432,9 @@ static inline bool rcu_exp_worker_started(void) return !!READ_ONCE(rcu_exp_gp_kworker); } -static inline bool rcu_exp_par_worker_started(void) +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) { - return !!READ_ONCE(rcu_exp_par_gp_kworker); + return !!READ_ONCE(rnp->exp_kworker); } static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) @@ -445,7 +445,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) * another work item on the same kthread worker can result in * deadlock. */ - kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); + kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work); } static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) @@ -487,7 +487,7 @@ static inline bool rcu_exp_worker_started(void) return !!READ_ONCE(rcu_gp_wq); } -static inline bool rcu_exp_par_worker_started(void) +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) { return !!READ_ONCE(rcu_par_gp_wq); } @@ -550,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_exp_par_worker_started() || + if (!rcu_exp_par_worker_started(rnp) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0d307674915c60..09bdd36ca9ffcc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->kthread_mutex); - if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - goto out; + if (rnp->boost_kthread_task) + return; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - goto out; + return; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1210,9 +1209,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - - out: - mutex_unlock(&rnp->kthread_mutex); } /* From a48606b1f7c8d1dbc53c0a980b1303798e5b8177 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:20 +0100 Subject: [PATCH 220/707] rcu/exp: Handle parallel exp gp kworkers affinity Affine the parallel expedited gp kworkers to their respective RCU node in order to make them close to the cache their are playing with. This reuses the boost kthreads machinery that probe into CPU hotplug operations such that the kthreads become/stay affine to their respective node as soon/long as they contain online CPUs. Otherwise and if the current CPU going down was the last online on the leaf node, the related kthread is affine to the housekeeping CPUs. In the long run, this affinity VS CPU hotplug operation game should probably be implemented at the generic kthread level. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 79 +++++++++++++++++++++++++++++++++++++--- kernel/rcu/tree_plugin.h | 42 ++------------------- 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5371f8fa0ee21b..40bfc58f18213d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -4426,6 +4426,16 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); } +static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) +{ + struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); + + if (!kworker) + return NULL; + + return kworker->task; +} + static void __init rcu_start_exp_gp_kworker(void) { const char *name = "rcu_exp_gp_kthread_worker"; @@ -4450,6 +4460,11 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) { } +static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) +{ + return NULL; +} + static void __init rcu_start_exp_gp_kworker(void) { } @@ -4528,13 +4543,67 @@ int rcutree_prepare_cpu(unsigned int cpu) } /* - * Update RCU priority boot kthread affinity for CPU-hotplug changes. + * Update kthreads affinity during CPU-hotplug changes. + * + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + * + * Any future concurrent calls are serialized via ->kthread_mutex. */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + cpumask_var_t cm; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct task_struct *task_boost, *task_exp; + + if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST)) + return; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + + task_boost = rcu_boost_task(rnp); + task_exp = rcu_exp_par_gp_task(rnp); + + /* + * If CPU is the boot one, those tasks are created later from early + * initcall since kthreadd must be created first. + */ + if (!task_boost && !task_exp) + return; + + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) + return; + + mutex_lock(&rnp->kthread_mutex); + mask = rcu_rnp_online_cpus(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (cpumask_empty(cm)) { + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } + + if (task_exp) + set_cpus_allowed_ptr(task_exp, cm); + + if (task_boost) + set_cpus_allowed_ptr(task_boost, cm); + + mutex_unlock(&rnp->kthread_mutex); - rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); + free_cpumask_var(cm); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 09bdd36ca9ffcc..08246cca663f0f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1211,43 +1211,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct *rcu_boost_task(struct rcu_node *rnp) { - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask; - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->kthread_mutex); - free_cpumask_var(cm); + return READ_ONCE(rnp->boost_kthread_task); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1266,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct * rcu_boost_task(struct rcu_node *rnp) { + return NULL; } - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* From 5c2905e35a3aa9a65e7f651a6a07a2bd2eb9449a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:21 +0100 Subject: [PATCH 221/707] rcu/exp: Remove rcu_par_gp_wq TREE04 running on short iterations can produce writer stalls of the following kind: ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0 task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000 Call Trace: __schedule+0x2de/0x850 ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0 schedule+0x4f/0x90 synchronize_rcu_expedited+0x430/0x670 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_synchronize_rcu_expedited+0x10/0x10 do_rtws_sync.constprop.0+0xde/0x230 rcu_torture_writer+0x4b4/0xcd0 ? __pfx_rcu_torture_writer+0x10/0x10 kthread+0xc7/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Waiting for an expedited grace period and polling for an expedited grace period both are operations that internally rely on the same workqueue performing necessary asynchronous work. However, a dependency chain is involved between those two operations, as depicted below: ====== CPU 0 ======= ====== CPU 1 ======= synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); start_poll_synchronize_rcu_expedited queue_work(rcu_gp_wq, &rnp->exp_poll_wq); synchronize_rcu_expedited_queue_work() queue_work(rcu_gp_wq, &rew->rew_work); wait_event() // A, wait for &rew->rew_work completion mutex_unlock() // B //======> switch to kworker sync_rcu_do_polled_gp() { synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); // C, wait B .... } // D Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time because A then doesn't depend on D. But in case of memory stress, a single kworker may end up handling alone all the works in a serialized way. In that case the above layout becomes a problem because A then waits for D, closing a circular dependency: A -> D -> C -> B -> A This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on rcu_gp_wq workqueue, breaking the above circular dependency chain. Fix this with making expedited grace period to always rely on kthread worker. The workqueue based implementation is essentially a duplicate anyway now that the per-node initialization is performed by per-node kthread workers. Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers. Reported-by: Anna-Maria Behnsen Reported-by: Thomas Gleixner Suggested-by: Joel Fernandes Suggested-by: Paul E. McKenney Suggested-by: Neeraj upadhyay Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 --- kernel/rcu/tree.c | 40 ++++-------------------- kernel/rcu/tree.h | 6 +--- kernel/rcu/tree_exp.h | 73 +------------------------------------------ 4 files changed, 8 insertions(+), 115 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4bc8cd6d461e64..4e65a92e528e5c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -623,11 +623,7 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; -#ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -#else /* !CONFIG_RCU_EXP_KTHREAD */ -extern struct workqueue_struct *rcu_par_gp_wq; -#endif /* CONFIG_RCU_EXP_KTHREAD */ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40bfc58f18213d..c8980d76f40292 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4403,7 +4403,6 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4423,7 +4422,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) return; } WRITE_ONCE(rnp->exp_kworker, kworker); - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); } static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) @@ -4447,39 +4448,14 @@ static void __init rcu_start_exp_gp_kworker(void) rcu_exp_gp_kworker = NULL; return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) -{ -} - -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - return NULL; -} - -static void __init rcu_start_exp_gp_kworker(void) -{ -} -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); } -#endif /* CONFIG_RCU_EXP_KTHREAD */ static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) { - if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) || - IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) { + if (rcu_scheduler_fully_active) { mutex_lock(&rnp->kthread_mutex); rcu_spawn_one_boost_kthread(rnp); rcu_spawn_exp_par_gp_kworker(rnp); @@ -4563,9 +4539,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) struct rcu_node *rnp; struct task_struct *task_boost, *task_exp; - if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST)) - return; - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; @@ -5255,7 +5228,6 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a0c02d631f448e..df48160b3136dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -21,14 +21,10 @@ #include "rcu_segcblist.h" -/* Communicate arguments to a workqueue handler. */ +/* Communicate arguments to a kthread worker handler. */ struct rcu_exp_work { unsigned long rew_s; -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_work rew_work; -#else - struct work_struct rew_work; -#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0318a8a062d5c0..6b83537480b12f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -418,7 +418,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) static void rcu_exp_sel_wait_wake(unsigned long s); -#ifdef CONFIG_RCU_EXP_KTHREAD static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) { struct rcu_exp_work *rewp = @@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); } -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) -{ - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); - - __sync_rcu_exp_select_node_cpus(rewp); -} - -static inline bool rcu_exp_worker_started(void) -{ - return !!READ_ONCE(rcu_gp_wq); -} - -static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) -{ - return !!READ_ONCE(rcu_par_gp_wq); -} - -static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) -{ - int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); -} - -static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) -{ - flush_work(&rnp->rew.rew_work); -} - -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - -static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) -{ - INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew->rew_work); -} - -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ - destroy_work_on_stack(&rew->rew_work); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Select the nodes that the upcoming expedited grace period needs * to wait for. @@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool use_worker; unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) && - rcu_exp_worker_started(); - /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. @@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(!use_worker)) { + if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - - if (likely(use_worker)) - synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From fd3b5e371b1b8aa5bb050ba5bc2839feb58d95b9 Mon Sep 17 00:00:00 2001 From: Onkarnath Date: Thu, 11 Jan 2024 14:57:22 +0530 Subject: [PATCH 222/707] rcu/sync: remove un-used rcu_sync_enter_start function With commit '6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem operations optional")' usage of rcu_sync_enter_start is removed. So this function can also be removed. In the words of Oleg Nesterov: __rcu_sync_enter(wait => false) is a better alternative if someone needs rcu_sync_enter_start() again. Link: https://lore.kernel.org/all/20220725121208.GB28662@redhat.com/ Signed-off-by: Onkarnath Signed-off-by: Maninder Singh Acked-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Paul E. McKenney --- include/linux/rcu_sync.h | 1 - kernel/rcu/sync.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 0027d4c8087c9a..3860dbb9107a21 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -37,7 +37,6 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *); -extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e550f97779b8dc..86df878a2fee8b 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp) init_waitqueue_head(&rsp->gp_wait); } -/** - * rcu_sync_enter_start - Force readers onto slow path for multiple updates - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * Must be called after rcu_sync_init() and before first use. - * - * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() - * pairs turn into NO-OPs. - */ -void rcu_sync_enter_start(struct rcu_sync *rsp) -{ - rsp->gp_count++; - rsp->gp_state = GP_PASSED; -} - - static void rcu_sync_func(struct rcu_head *rhp); static void rcu_sync_call(struct rcu_sync *rsp) From 7b4989d7fbc92d7ff1d317e1293bea4a47aab18a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 17 Jan 2024 18:26:16 +0800 Subject: [PATCH 223/707] rcu/nocb: Check rdp_gp->nocb_timer in __call_rcu_nocb_wake() Currently, only rdp_gp->nocb_timer is used, for nocb_timer of no-rdp_gp structure, the timer_pending() is always return false, this commit therefore need to check rdp_gp->nocb_timer in __call_rcu_nocb_wake(). Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 54971afc3a9b25..3f85577bddd4ef 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -564,6 +564,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, long lazy_len; long len; struct task_struct *t; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); @@ -608,7 +609,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). */ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { + !timer_pending(&rdp_gp->nocb_timer)) { rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); From bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Jan 2024 07:07:08 -0800 Subject: [PATCH 224/707] rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore eliminates these deadlock by replacing the SRCU-based wait for do_exit() completion with per-CPU lists of tasks currently exiting. A given task will be on one of these per-CPU lists for the same period of time that this task would previously have been in the previous SRCU read-side critical section. These lists enable RCU Tasks to find the tasks that have already been removed from the tasks list, but that must nevertheless be waited upon. The RCU Tasks grace period gathers any of these do_exit() tasks that it must wait on, and adds them to the list of holdouts. Per-CPU locking and get_task_struct() are used to synchronize addition to and removal from these lists. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 2 + init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 89 ++++++++++++++++++++++++++++++------------- 4 files changed, 67 insertions(+), 26 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365ba..4f0e9274da2de4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,6 +858,8 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb3da60..4daee6d761c86c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -147,6 +147,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .rcu_tasks_holdout = false, .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, + .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list), #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, diff --git a/kernel/fork.c b/kernel/fork.c index 47ff3b35352e0b..3eb86f30e66418 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1975,6 +1975,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; + INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 732ad5b39946a5..bd4a51fd5b1fba 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -46,6 +47,7 @@ struct rcu_tasks_percpu { struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; int cpu; struct rcu_tasks *rtpp; }; @@ -144,8 +146,6 @@ static struct rcu_tasks rt_name = \ } #ifdef CONFIG_TASKS_RCU -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); @@ -275,6 +275,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); } pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, @@ -851,10 +853,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // number of voluntary context switches, and add that task to the // holdout list. // rcu_tasks_postscan(): -// Invoke synchronize_srcu() to ensure that all tasks that were -// in the process of exiting (and which thus might not know to -// synchronize with this RCU Tasks grace period) have completed -// exiting. +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_stop(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -867,8 +871,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // with interrupts disabled. // // For each exiting task, the exit_tasks_rcu_start() and -// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU -// read-side critical sections waited for by rcu_tasks_postscan(). +// exit_tasks_rcu_finish() functions add and remove, respectively, the +// current task to a per-CPU list of tasks that rcu_tasks_postscan() must +// wait on. This is necessary because rcu_tasks_postscan() must wait on +// tasks that have already been removed from the global list of tasks. // // Pre-grace-period update-side code is ordered before the grace // via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code @@ -932,9 +938,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } } +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int cpu; int rtsi = READ_ONCE(rcu_task_stall_info); if (!IS_ENABLED(CONFIG_TINY_RCU)) { @@ -948,9 +958,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * this, divide the fragile exit path part in two intersecting * read side critical sections: * - * 1) An _SRCU_ read side starting before calling exit_notify(), - * which may remove the task from the tasklist, and ending after - * the final preempt_disable() call in do_exit(). + * 1) A task_struct list addition before calling exit_notify(), + * which may remove the task from the tasklist, with the + * removal after the final preempt_disable() call in do_exit(). * * 2) An _RCU_ read side starting with the final preempt_disable() * call in do_exit() and ending with the final call to schedule() @@ -959,7 +969,18 @@ static void rcu_tasks_postscan(struct list_head *hop) * This handles the part 1). And postgp will handle part 2) with a * call to synchronize_rcu(). */ - synchronize_srcu(&tasks_rcu_exit_srcu); + + for_each_possible_cpu(cpu) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) + if (list_empty(&t->rcu_tasks_holdout_list)) + rcu_tasks_pertask(t, hop); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } if (!IS_ENABLED(CONFIG_TINY_RCU)) del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); @@ -1027,7 +1048,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), * enforcing the whole region before tasklist removal until * the final schedule() with TASK_DEAD state to be an RCU TASKS * read side critical section. @@ -1035,9 +1055,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) synchronize_rcu(); } -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); - static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) { #ifndef CONFIG_TINY_RCU @@ -1147,25 +1164,45 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_start(void) { - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); + get_task_struct(t); + preempt_disable(); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + preempt_enable(); } /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_stop(void) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; struct task_struct *t = current; - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list)); + rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->rcu_tasks_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + put_task_struct(t); } /* From c931eafc5a82082e3b5ff429024af07f4886a8de Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 22 Jan 2024 09:02:47 -0500 Subject: [PATCH 225/707] Bluetooth: hci_event: Fix handling of HCI_EV_IO_CAPA_REQUEST If we received HCI_EV_IO_CAPA_REQUEST while HCI_OP_READ_REMOTE_EXT_FEATURES is yet to be responded assume the remote does support SSP since otherwise this event shouldn't be generated. Link: https://lore.kernel.org/linux-bluetooth/CABBYNZ+9UdG1cMZVmdtN3U2aS16AKMCyTARZZyFX7xTEDWcMOw@mail.gmail.com/T/#t Fixes: c7f59461f5a7 ("Bluetooth: Fix a refcnt underflow problem for hci_conn") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6130c969f361a7..a15924db83d9fe 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5327,9 +5327,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!conn || !hci_conn_ssp_enabled(conn)) + if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) goto unlock; + /* Assume remote supports SSP since it has triggered this event */ + set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); + hci_conn_hold(conn); if (!hci_dev_test_flag(hdev, HCI_MGMT)) From 9434e62334d5030e442c2be1fd96aad8816052a5 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 25 Jan 2024 14:50:28 +0800 Subject: [PATCH 226/707] Bluetooth: Enforce validation on max value of connection interval Right now Linux BT stack cannot pass test case "GAP/CONN/CPUP/BV-05-C 'Connection Parameter Update Procedure Invalid Parameters Central Responder'" in Bluetooth Test Suite revision GAP.TS.p44. [0] That was revoled by commit c49a8682fc5d ("Bluetooth: validate BLE connection interval updates"), but later got reverted due to devices like keyboards and mice may require low connection interval. So only validate the max value connection interval to pass the Test Suite, and let devices to request low connection interval if needed. [0] https://www.bluetooth.org/docman/handlers/DownloadDoc.ashx?doc_id=229869 Fixes: 68d19d7d9957 ("Revert "Bluetooth: validate BLE connection interval updates"") Signed-off-by: Kai-Heng Feng Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++++ net/bluetooth/l2cap_core.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a15924db83d9fe..31df5f5b799455 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6795,6 +6795,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_UNKNOWN_CONN_ID); + if (max > hcon->le_conn_max_interval) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) return send_conn_param_neg_reply(hdev, handle, HCI_ERROR_INVALID_LL_PARAMS); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 60298975d5c456..656f49b299d20d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5613,7 +5613,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = hci_check_conn_params(min, max, latency, to_multiplier); + if (max > hcon->le_conn_max_interval) { + BT_DBG("requested connection interval exceeds current bounds."); + err = -EINVAL; + } else { + err = hci_check_conn_params(min, max, latency, to_multiplier); + } + if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else From 693a94db9e8cff14cce892cba6818bc67ab51ec4 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Thu, 18 Jan 2024 12:40:34 +0800 Subject: [PATCH 227/707] Bluetooth: btintel: Fix null ptr deref in btintel_read_version If hci_cmd_sync_complete() is triggered and skb is NULL, then hdev->req_skb is NULL, which will cause this issue. Reported-and-tested-by: syzbot+830d9e3fa61968246abd@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index cdc5c08824a0ad..e5b043d9620730 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -435,7 +435,7 @@ int btintel_read_version(struct hci_dev *hdev, struct intel_version *ver) struct sk_buff *skb; skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT); - if (IS_ERR(skb)) { + if (IS_ERR_OR_NULL(skb)) { bt_dev_err(hdev, "Reading Intel version information failed (%ld)", PTR_ERR(skb)); return PTR_ERR(skb); From 64692e12507b3efd71b4ff5596c9742d91f1ffe5 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 19 Jan 2024 17:45:30 +0800 Subject: [PATCH 228/707] Bluetooth: qca: Fix wrong event type for patch config command Vendor-specific command patch config has HCI_Command_Complete event as response, but qca_send_patch_config_cmd() wrongly expects vendor-specific event for the command, fixed by using right event type. Btmon log for the vendor-specific command are shown below: < HCI Command: Vendor (0x3f|0x0000) plen 5 28 01 00 00 00 > HCI Event: Command Complete (0x0e) plen 5 Vendor (0x3f|0x0000) ncmd 1 Status: Success (0x00) 28 Fixes: 4fac8a7ac80b ("Bluetooth: btqca: sequential validation") Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btqca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index fdb0fae88d1c58..b40b32fa7f1c38 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -152,7 +152,7 @@ static int qca_send_patch_config_cmd(struct hci_dev *hdev) bt_dev_dbg(hdev, "QCA Patch config"); skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, sizeof(cmd), - cmd, HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + cmd, 0, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "Sending QCA Patch config failed (%d)", err); From fb09ad63d798533d4a3a338513c1ef1a547955f0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Jan 2024 09:11:13 -0800 Subject: [PATCH 229/707] f2fs: remove unnecessary f2fs_put_page in f2fs_rename [1] changed the below condition, which made f2fs_put_page() voided. This patch reapplies the AL's resolution in -next from [2]. - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); if (!old_dir_entry) { if (IS_ERR(old_dir_page)) [1] 7deee77b993a ("f2fs: Avoid reading renamed directory if parent does not change") [2] https://lore.kernel.org/all/20231220013402.GW1674809@ZenIV/ Suggested-by: Al Viro Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b3bb815fc6aa45..ba11298b78379a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1105,14 +1105,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, iput(whiteout); } - if (old_is_dir) { - if (old_dir_entry) - f2fs_set_link(old_inode, old_dir_entry, - old_dir_page, new_dir); - else - f2fs_put_page(old_dir_page, 0); + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + if (old_is_dir) f2fs_i_links_write(old_dir, false); - } + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); if (S_ISDIR(old_inode->i_mode)) From e2f29120ff1f25b3bd59310b0443ce7f049c5f0a Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Thu, 28 Dec 2023 20:25:07 -0700 Subject: [PATCH 230/707] f2fs: check free sections before disable checkpoint 'f2fs_is_checkpoint_ready()' checks free sections. If there is not enough free sections, most f2fs operations will return -ENOSPC when checkpoint is disabled. It would be better to check free sections before disable checkpoint. Signed-off-by: Wu Bo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c8836ded90fc2..f0f12b1eddc8b0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -906,6 +906,8 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; + if (has_not_enough_free_secs(sbi, 0, 0)) + return -EAGAIN; return 0; } From fe2b98bcae7e5120134a3c44051c51fade60fa15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 13 Jan 2024 03:41:27 +0800 Subject: [PATCH 231/707] f2fs: compress: fix to guarantee persisting compressed blocks by CP If data block in compressed cluster is not persisted with metadata during checkpoint, after SPOR, the data may be corrupted, let's guarantee to write compressed page by checkpoint. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reviewed-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 4 +++- fs/f2fs/data.c | 17 +++++++++-------- fs/f2fs/f2fs.h | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 531517dac07967..3a8d8a213b4063 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) struct f2fs_sb_info *sbi = bio->bi_private; struct compress_io_ctx *cic = (struct compress_io_ctx *)page_private(page); + enum count_type type = WB_DATA_TYPE(page, + f2fs_is_compressed_page(page)); int i; if (unlikely(bio->bi_status)) @@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) f2fs_compress_free_page(page); - dec_page_count(sbi, F2FS_WB_DATA); + dec_page_count(sbi, type); if (atomic_dec_return(&cic->pending_pages)) return; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 26e317696b3389..d00e92b6c90250 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static bool __is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; @@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page) S_ISDIR(inode->i_mode)) return true; - if (f2fs_is_compressed_page(page)) - return false; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; @@ -338,7 +336,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page); + enum count_type type = WB_DATA_TYPE(page, false); if (page_private_dummy(page)) { clear_page_private_dummy(page); @@ -762,7 +760,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page) : WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); @@ -973,7 +971,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); - inc_page_count(fio->sbi, WB_DATA_TYPE(page)); + inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1007,6 +1005,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; + enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -1046,7 +1045,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) /* set submitted = true as a return value */ fio->submitted = 1; - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + type = WB_DATA_TYPE(bio_page, fio->compressed_page); + inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, @@ -1059,7 +1059,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + dec_page_count(sbi, WB_DATA_TYPE(bio_page, + fio->compressed_page)); fio->retry = 1; goto skip; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65294e3b0bef88..50f3d546ded858 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1080,7 +1080,8 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ -#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +#define WB_DATA_TYPE(p, f) \ + (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -3804,6 +3805,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); +bool f2fs_is_cp_guaranteed(struct page *page); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, From bd0dddafd4a51e15349f6dfbaf0f052be3334f25 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 13 Jan 2024 03:41:28 +0800 Subject: [PATCH 232/707] f2fs: compress: fix to cover normal cluster write with cp_rwsem When we overwrite compressed cluster w/ normal cluster, we should not unlock cp_rwsem during f2fs_write_raw_pages(), otherwise data will be corrupted if partial blocks were persisted before CP & SPOR, due to cluster metadata wasn't updated atomically. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reviewed-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 27 ++++++++++++++++++--------- fs/f2fs/data.c | 3 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3a8d8a213b4063..ff26b49c0d71ff 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1443,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) } static int f2fs_write_raw_pages(struct compress_ctx *cc, - int *submitted, + int *submitted_p, struct writeback_control *wbc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret, i; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + int submitted, compr_blocks, i; + int ret = 0; compr_blocks = f2fs_compressed_blocks(cc); @@ -1463,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (compr_blocks < 0) return compr_blocks; + /* overwrite compressed cluster w/ normal cluster */ + if (compr_blocks > 0) + f2fs_lock_op(sbi); + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; @@ -1487,7 +1493,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; - ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + ret = f2fs_write_single_data_page(cc->rpages[i], &submitted, NULL, NULL, wbc, io_type, compr_blocks, false); if (ret) { @@ -1495,26 +1501,29 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { + ret = 0; /* * for quota file, just redirty left pages to * avoid deadlock caused by cluster update race * from foreground operation. */ if (IS_NOQUOTA(cc->inode)) - return 0; - ret = 0; + goto out; f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } - return ret; + goto out; } - *submitted += _submitted; + *submitted_p += submitted; } - f2fs_balance_fs(F2FS_M_SB(mapping), true); +out: + if (compr_blocks > 0) + f2fs_unlock_op(sbi); - return 0; + f2fs_balance_fs(sbi, true); + return ret; } int f2fs_write_multi_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d00e92b6c90250..7a93a99fbd04be 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2839,7 +2839,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, - .need_lock = LOCK_RETRY, + .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY, .post_read = f2fs_post_read_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, @@ -2920,6 +2920,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, if (err == -EAGAIN) { err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { + f2fs_bug_on(sbi, compr_blocks); fio.need_lock = LOCK_REQ; err = f2fs_do_write_data_page(&fio); } From 787f217e7f04b175794390572a8f528cb1840be5 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sat, 13 Jan 2024 03:41:29 +0800 Subject: [PATCH 233/707] f2fs: compress: fix to check unreleased compressed cluster Compressed cluster may not be released due to we can fail in release_compress_blocks(), fix to handle reserved compressed cluster correctly in reserve_compress_blocks(). Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Sheng Yong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b58ab1157b7ef2..941e02c0953ccf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3624,7 +3624,13 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) goto next; } - if (__is_valid_data_blkaddr(blkaddr)) { + /* + * compressed cluster was not released due to it + * fails in release_compress_blocks(), so NEW_ADDR + * is a possible case. + */ + if (blkaddr == NEW_ADDR || + __is_valid_data_blkaddr(blkaddr)) { compr_blocks++; continue; } @@ -3633,6 +3639,11 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) } reserved = cluster_size - compr_blocks; + + /* for the case all blocks in cluster were reserved */ + if (reserved == 1) + goto next; + ret = inc_valid_block_count(sbi, dn->inode, &reserved); if (ret) return ret; From 6d05c8d5997ac3543c87f0a8d9a1b30ec3cc860f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 13 Jan 2024 03:41:30 +0800 Subject: [PATCH 234/707] f2fs: compress: fix to avoid inconsistence bewteen i_blocks and dnode In reserve_compress_blocks(), we update blkaddrs of dnode in prior to inc_valid_block_count(), it may cause inconsistent status bewteen i_blocks and blkaddrs once inc_valid_block_count() fails. To fix this issue, it needs to reverse their invoking order. Fixes: c75488fb4d82 ("f2fs: introduce F2FS_IOC_RESERVE_COMPRESS_BLOCKS") Reviewed-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- fs/f2fs/f2fs.h | 7 ++++++- fs/f2fs/file.c | 26 ++++++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7a93a99fbd04be..65fe48bb17d16b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1219,7 +1219,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + err = inc_valid_block_count(sbi, dn->inode, &count, true); + if (unlikely(err)) return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, @@ -1476,7 +1477,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr == NULL_ADDR) { - err = inc_valid_block_count(sbi, dn->inode, &count); + err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 50f3d546ded858..69e71460a9502d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2252,7 +2252,7 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t *count) + struct inode *inode, blkcnt_t *count, bool partial) { blkcnt_t diff = 0, release = 0; block_t avail_user_block_count; @@ -2292,6 +2292,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = 0; } if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + if (!partial) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) diff = *count; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 941e02c0953ccf..1ff1c45e192711 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3614,14 +3614,16 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) blkcnt_t reserved; int ret; - for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { - blkaddr = f2fs_data_blkaddr(dn); + for (i = 0; i < cluster_size; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); if (i == 0) { - if (blkaddr == COMPRESS_ADDR) - continue; - dn->ofs_in_node += cluster_size; - goto next; + if (blkaddr != COMPRESS_ADDR) { + dn->ofs_in_node += cluster_size; + goto next; + } + continue; } /* @@ -3634,8 +3636,6 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) compr_blocks++; continue; } - - f2fs_set_data_blkaddr(dn, NEW_ADDR); } reserved = cluster_size - compr_blocks; @@ -3644,12 +3644,14 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (reserved == 1) goto next; - ret = inc_valid_block_count(sbi, dn->inode, &reserved); - if (ret) + ret = inc_valid_block_count(sbi, dn->inode, &reserved, false); + if (unlikely(ret)) return ret; - if (reserved != cluster_size - compr_blocks) - return -ENOSPC; + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + if (f2fs_data_blkaddr(dn) == NULL_ADDR) + f2fs_set_data_blkaddr(dn, NEW_ADDR); + } f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f0f12b1eddc8b0..7901ede5811357 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -248,7 +248,7 @@ static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, } else { blkcnt_t count = 1; - err = inc_valid_block_count(sbi, inode, &count); + err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; From ef62f2496f99fd736daef4622ec43175a360bd08 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 13 Jan 2024 03:41:31 +0800 Subject: [PATCH 235/707] f2fs: fix to remove unnecessary f2fs_bug_on() to avoid panic verify_blkaddr() will trigger panic once we inject fault into f2fs_is_valid_blkaddr(), fix to remove this unnecessary f2fs_bug_on(). Fixes: 18792e64c86d ("f2fs: support fault injection for f2fs_is_valid_blkaddr()") Reviewed-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 69e71460a9502d..ab710bb6d8b32e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3470,11 +3470,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, static inline void verify_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", blkaddr, type); - f2fs_bug_on(sbi, 1); - } } static inline bool __is_valid_data_blkaddr(block_t blkaddr) From e2c2cb1a331fc09cedcb8b395a6b70acf2c33815 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 13 Jan 2024 03:41:32 +0800 Subject: [PATCH 236/707] f2fs: introduce FAULT_BLKADDR_CONSISTENCE We will encounter below inconsistent status when FAULT_BLKADDR type fault injection is on. Info: checkpoint state = d6 : nat_bits crc fsck compacted_summary orphan_inodes sudden-power-off [ASSERT] (fsck_chk_inode_blk:1254) --> ino: 0x1c100 has i_blocks: 000000c0, but has 191 blocks [FIX] (fsck_chk_inode_blk:1260) --> [0x1c100] i_blocks=0x000000c0 -> 0xbf [FIX] (fsck_chk_inode_blk:1269) --> [0x1c100] i_compr_blocks=0x00000026 -> 0x27 [ASSERT] (fsck_chk_inode_blk:1254) --> ino: 0x1cadb has i_blocks: 0000002f, but has 46 blocks [FIX] (fsck_chk_inode_blk:1260) --> [0x1cadb] i_blocks=0x0000002f -> 0x2e [FIX] (fsck_chk_inode_blk:1269) --> [0x1cadb] i_compr_blocks=0x00000011 -> 0x12 [ASSERT] (fsck_chk_inode_blk:1254) --> ino: 0x1c62c has i_blocks: 00000002, but has 1 blocks [FIX] (fsck_chk_inode_blk:1260) --> [0x1c62c] i_blocks=0x00000002 -> 0x1 After we inject fault into f2fs_is_valid_blkaddr() during truncation, a) it missed to increase @nr_free or @valid_blocks b) it can cause in blkaddr leak in truncated dnode Which may cause inconsistent status. This patch separates FAULT_BLKADDR_CONSISTENCE from FAULT_BLKADDR, and rename FAULT_BLKADDR to FAULT_BLKADDR_VALIDITY so that we can: a) use FAULT_BLKADDR_CONSISTENCE in f2fs_truncate_data_blocks_range() to simulate inconsistent issue independently, then it can verify fsck repair flow. b) FAULT_BLKADDR_VALIDITY fault will not cause any inconsistent status, we can just use it to check error path handling in kernel side. Reviewed-by: Daeho Jeong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 47 +++++++++++++------------ Documentation/filesystems/f2fs.rst | 47 +++++++++++++------------ fs/f2fs/checkpoint.c | 19 +++++++--- fs/f2fs/f2fs.h | 5 ++- fs/f2fs/file.c | 8 +++-- fs/f2fs/super.c | 37 +++++++++---------- 6 files changed, 92 insertions(+), 71 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 99fa87a43926e6..48c135e24eb57d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -701,29 +701,30 @@ Description: Support configuring fault injection type, should be enabled with fault_injection option, fault type value is shown below, it supports single or combined type. - =================== =========== - Type_Name Type_Value - =================== =========== - FAULT_KMALLOC 0x000000001 - FAULT_KVMALLOC 0x000000002 - FAULT_PAGE_ALLOC 0x000000004 - FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 (obsolete) - FAULT_ALLOC_NID 0x000000020 - FAULT_ORPHAN 0x000000040 - FAULT_BLOCK 0x000000080 - FAULT_DIR_DEPTH 0x000000100 - FAULT_EVICT_INODE 0x000000200 - FAULT_TRUNCATE 0x000000400 - FAULT_READ_IO 0x000000800 - FAULT_CHECKPOINT 0x000001000 - FAULT_DISCARD 0x000002000 - FAULT_WRITE_IO 0x000004000 - FAULT_SLAB_ALLOC 0x000008000 - FAULT_DQUOT_INIT 0x000010000 - FAULT_LOCK_OP 0x000020000 - FAULT_BLKADDR 0x000040000 - =================== =========== + =========================== =========== + Type_Name Type_Value + =========================== =========== + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 (obsolete) + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_READ_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 + FAULT_SLAB_ALLOC 0x000008000 + FAULT_DQUOT_INIT 0x000010000 + FAULT_LOCK_OP 0x000020000 + FAULT_BLKADDR_VALIDITY 0x000040000 + FAULT_BLKADDR_CONSISTENCE 0x000080000 + =========================== =========== What: /sys/fs/f2fs//discard_io_aware_gran Date: January 2023 diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index d32c6209685d64..32cbfa864f389b 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -184,29 +184,30 @@ fault_type=%d Support configuring fault injection type, should be enabled with fault_injection option, fault type value is shown below, it supports single or combined type. - =================== =========== - Type_Name Type_Value - =================== =========== - FAULT_KMALLOC 0x000000001 - FAULT_KVMALLOC 0x000000002 - FAULT_PAGE_ALLOC 0x000000004 - FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 (obsolete) - FAULT_ALLOC_NID 0x000000020 - FAULT_ORPHAN 0x000000040 - FAULT_BLOCK 0x000000080 - FAULT_DIR_DEPTH 0x000000100 - FAULT_EVICT_INODE 0x000000200 - FAULT_TRUNCATE 0x000000400 - FAULT_READ_IO 0x000000800 - FAULT_CHECKPOINT 0x000001000 - FAULT_DISCARD 0x000002000 - FAULT_WRITE_IO 0x000004000 - FAULT_SLAB_ALLOC 0x000008000 - FAULT_DQUOT_INIT 0x000010000 - FAULT_LOCK_OP 0x000020000 - FAULT_BLKADDR 0x000040000 - =================== =========== + =========================== =========== + Type_Name Type_Value + =========================== =========== + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 (obsolete) + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_READ_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 + FAULT_SLAB_ALLOC 0x000008000 + FAULT_DQUOT_INIT 0x000010000 + FAULT_LOCK_OP 0x000020000 + FAULT_BLKADDR_VALIDITY 0x000040000 + FAULT_BLKADDR_CONSISTENCE 0x000080000 + =========================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b0597a539fc548..b85820e70f5e44 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -170,12 +170,9 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, return exist; } -bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, +static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (time_to_inject(sbi, FAULT_BLKADDR)) - return false; - switch (type) { case META_NAT: break; @@ -230,6 +227,20 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, return true; } +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY)) + return false; + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); +} + +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); +} + /* * Readahead CP/NAT/SIT/SSA/POR pages */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ab710bb6d8b32e..4481f68d64181c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,7 +60,8 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, - FAULT_BLKADDR, + FAULT_BLKADDR_VALIDITY, + FAULT_BLKADDR_CONSISTENCE, FAULT_MAX, }; @@ -3768,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1ff1c45e192711..25b119cf3499d7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -590,9 +590,13 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) f2fs_set_data_blkaddr(dn, NULL_ADDR); if (__is_valid_data_blkaddr(blkaddr)) { - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE)) + if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE)) + continue; + if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); continue; + } if (compressed_cluster) valid_blocks++; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d45ab0992ae594..e2c066fbc0fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,24 +44,25 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION const char *f2fs_fault_name[FAULT_MAX] = { - [FAULT_KMALLOC] = "kmalloc", - [FAULT_KVMALLOC] = "kvmalloc", - [FAULT_PAGE_ALLOC] = "page alloc", - [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_NID] = "alloc nid", - [FAULT_ORPHAN] = "orphan", - [FAULT_BLOCK] = "no more block", - [FAULT_DIR_DEPTH] = "too big dir depth", - [FAULT_EVICT_INODE] = "evict_inode fail", - [FAULT_TRUNCATE] = "truncate fail", - [FAULT_READ_IO] = "read IO error", - [FAULT_CHECKPOINT] = "checkpoint error", - [FAULT_DISCARD] = "discard error", - [FAULT_WRITE_IO] = "write IO error", - [FAULT_SLAB_ALLOC] = "slab alloc", - [FAULT_DQUOT_INIT] = "dquot initialize", - [FAULT_LOCK_OP] = "lock_op", - [FAULT_BLKADDR] = "invalid blkaddr", + [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", + [FAULT_READ_IO] = "read IO error", + [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", + [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, From 38c9ce091a4bd0ff272438131424e98ea0e3906d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Fri, 26 Jan 2024 11:55:16 +0000 Subject: [PATCH 237/707] dt-bindings: samsung: exynos-sysreg: gs101-peric0 requires a clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... otherwise it won't be accessible. Update the schema to make this obvious. Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20240126115517.1751971-1-andre.draszik@linaro.org Signed-off-by: Krzysztof Kozlowski --- .../devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml b/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml index 1794e3799f2110..33d837ae4f4577 100644 --- a/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml +++ b/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml @@ -72,6 +72,7 @@ allOf: compatible: contains: enum: + - google,gs101-peric0-sysreg - samsung,exynos850-cmgp-sysreg - samsung,exynos850-peri-sysreg - samsung,exynos850-sysreg From 34b2321cc648a246d08cc51e423532eac690ccf1 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Mon, 15 Jan 2024 16:02:00 +0100 Subject: [PATCH 238/707] MAINTAINERS: Add Andreas Larsson as co-maintainer for arch/sparc Dave has not been very active on arch/sparc for the past two years. I have been contributing to the SPARC32 port as well as maintaining out-of-tree SPARC32 patches for LEON3/4/5 (SPARCv8 with CAS support) since 2012. I am willing to step up as an arch/sparc (co-)maintainer. For recent discussions on the matter, see [1] and [2]. [1] https://lore.kernel.org/r/20230713075235.2164609-1-u.kleine-koenig@pengutronix.de [2] https://lore.kernel.org/r/20231209105816.GA1085691@ravnborg.org/ Signed-off-by: Andreas Larsson Suggested-by: Arnd Bergmann Acked-by: Sam Ravnborg Acked-by: John Paul Adrian Glaubitz Acked-by: Arnd Bergmann Acked-by: Jose E. Marchesi Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..542ab762be7de4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20549,6 +20549,7 @@ F: Documentation/translations/sp_SP/ SPARC + UltraSPARC (sparc/sparc64) M: "David S. Miller" +M: Andreas Larsson L: sparclinux@vger.kernel.org S: Maintained Q: http://patchwork.ozlabs.org/project/sparclinux/list/ From 34b82a2fb7475aba5adfd3ae9f2c66da7f1979f7 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 26 Jan 2024 07:28:53 -0800 Subject: [PATCH 239/707] lkdtm/bugs: In lkdtm_HUNG_TASK() use BUG(), not BUG_ON(1) In commit edb6538da3df ("lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid tail call optimization") we marked lkdtm_HUNG_TASK() as __noreturn. The compiler gets unhappy if it thinks a __noreturn function might return, so there's a BUG_ON(1) at the end. Any human can see that the function won't return and the compiler can figure that out too. Except when it can't. The MIPS architecture defines HAVE_ARCH_BUG_ON and defines its own version of BUG_ON(). The MIPS version of BUG_ON() is not a macro but is instead an inline function. Apparently this prevents the compiler from realizing that the condition to BUG_ON() is constant and that the function will never return. Let's change the BUG_ON(1) to just BUG(), which it should have been to begin with. The only reason I used BUG_ON(1) to begin with was because I was used to using WARN_ON(1) when writing test code and WARN() and BUG() are oddly inconsistent in this manner. :-/ Fixes: edb6538da3df ("lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid tail call optimization") Signed-off-by: Douglas Anderson Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401262204.wUFKRYZF-lkp@intel.com/ Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240126072852.1.Ib065e528a8620474a72f15baa2feead1f3d89865@changeid Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index d1222d3eda2f19..b92767d6bdd244 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -298,7 +298,7 @@ static void __noreturn lkdtm_HUNG_TASK(void) { set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - BUG_ON(1); + BUG(); } static volatile unsigned int huge = INT_MAX - 2; From 3ef826924303470507ccd6e84d34c901c1ee34da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 14:14:59 -0700 Subject: [PATCH 240/707] block: move cgroup time handling code into blk.h In preparation for moving time keeping into blk.h, move the cgroup related code for timestamps in here too. This will help avoid a circular dependency, and also moves it into a more appropriate header as this one is private to the block layer code. Leave struct bio_issue in blk_types.h as it's a proper time definition. Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 1 + block/blk.h | 42 +++++++++++++++++++++++++++++++++++++++ include/linux/blk_types.h | 42 --------------------------------------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b927a4a0ad0301..78b74106bf10c5 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include "blk.h" struct blkcg_gq; struct blkg_policy_data; diff --git a/block/blk.h b/block/blk.h index 1ef920f72e0f87..620e3a035da116 100644 --- a/block/blk.h +++ b/block/blk.h @@ -516,4 +516,46 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b307..1c07848dea7ec0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; From 29bb740b66cc15175a119cd6fc59076580390116 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:45:07 -0700 Subject: [PATCH 241/707] block: add blk_time_get_ns() and blk_time_get() helpers Convert any user of ktime_get_ns() to use blk_time_get_ns(), and ktime_get() to blk_time_get(), so we have a unified API for querying the current time in nanoseconds or as ktime. No functional changes intended, this patch just wraps ktime_get_ns() and ktime_get() with a block helper. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 14 +++++++------- block/bfq-iosched.c | 28 ++++++++++++++-------------- block/blk-cgroup.c | 2 +- block/blk-flush.c | 2 +- block/blk-iocost.c | 8 ++++---- block/blk-iolatency.c | 6 +++--- block/blk-mq.c | 16 ++++++++-------- block/blk-throttle.c | 6 +++--- block/blk-wbt.c | 6 +++--- block/blk.h | 13 ++++++++++++- 10 files changed, 56 insertions(+), 45 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2c90e5de0acd94..d442ee358fc257 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } @@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } @@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, @@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } @@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3cce6de464a7b7..4b88a54a9b76cb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, rq = rq_entry_fifo(bfqq->fifo.next); - if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); @@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq) bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); + bfqd->last_empty_occupied_ns = blk_time_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting @@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get(); bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); - bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); bfqd->last_idling_start_jiffies = jiffies; hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), @@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -3590,7 +3590,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); @@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); @@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, */ if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; @@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqq); } - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); bfqq->ttime.last_end_request = now_ns; @@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff93c385ba5afb..bdbb557feb5a0e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; diff --git a/block/blk-flush.c b/block/blk-flush.c index 3f4d41952ef210..b0f314f4bc1493 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq) part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], - ktime_get_ns() - rq->start_time_ns); + blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c8beec6d7df086..4b0b483a969353 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) unsigned seq; u64 vrate; - now->now_ns = ktime_get(); + now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); @@ -2810,7 +2810,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) return; } - on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); @@ -2893,7 +2893,7 @@ static int blk_iocost_init(struct gendisk *disk) ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); - ioc->period_at = ktime_to_us(ktime_get()); + ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c1a6aba1d59e4d..ebb522788d9780 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat->blkiolat->enabled) return; - now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu; if (blk_queue_nonrot(blkg->q)) diff --git a/block/blk-mq.c b/block/blk-mq.c index aa87fcfda1ecfc..aff9e9492f59e4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -323,7 +323,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } @@ -333,7 +333,7 @@ EXPORT_SYMBOL(blk_rq_init); static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); else rq->start_time_ns = 0; @@ -444,7 +444,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -629,7 +629,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a @@ -1042,7 +1042,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, ktime_get_ns()); + __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); @@ -1085,7 +1085,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) u64 now = 0; if (iob->need_ts) - now = ktime_get_ns(); + now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); @@ -1255,7 +1255,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -3107,7 +3107,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) - blk_account_io_done(rq, ktime_get_ns()); + blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16f5766620a410..da9dc1f793c3b7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) if (last_finish_time == 0) return; - now = ktime_get_ns() >> 10; + now = blk_time_get_ns() >> 10; if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) if (!tg->td->limit_valid[LIMIT_LOW]) return; - finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 5ba3cd574eacbd..8cb53bf4c7b152 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h" #define CREATE_TRACE_POINTS #include @@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) diff --git a/block/blk.h b/block/blk.h index 620e3a035da116..79ae533cdf026f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include /* for max_pfn/max_low_pfn */ +#include #include #include "blk-crypto-internal.h" @@ -516,6 +517,16 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +static inline u64 blk_time_get_ns(void) +{ + return ktime_get_ns(); +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} + /* * From most significant bit: * 1 bit: reserved for other usage, see below @@ -554,7 +565,7 @@ static inline void bio_issue_init(struct bio_issue *issue, { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } From 1ff288ab5cf42de6b12cbcdb1d0db1ea73723656 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:46:03 -0700 Subject: [PATCH 242/707] block: cache current nsec time in struct blk_plug Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO. None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter. If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock. On a basic peak IOPS test case with iostats enabled, this changes the performance from: IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31 to IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32 which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead: 10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp patch, as long as the driver supports it. It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk.h | 14 +++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 11342af420d0c4..cc4db4d92c75b4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1073,6 +1073,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; + plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); diff --git a/block/blk.h b/block/blk.h index 79ae533cdf026f..14bbc4b780f275 100644 --- a/block/blk.h +++ b/block/blk.h @@ -519,7 +519,19 @@ static inline int req_ref_read(struct request *req) static inline u64 blk_time_get_ns(void) { - return ktime_get_ns(); + struct blk_plug *plug = current->plug; + + if (!plug) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!plug->cur_ktime) + plug->cur_ktime = ktime_get_ns(); + return plug->cur_ktime; } static inline ktime_t blk_time_get(void) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e722132c..996d2ad756ff78 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -942,6 +942,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; From 1a3d70bf0c0e9f711b1d653d05df201eee68ab5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 09:18:39 -0700 Subject: [PATCH 243/707] block: update cached timestamp post schedule/preemption Mark the task as having a cached timestamp when set assign it, so we can efficiently check if it needs updating post being scheduled back in. This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being then we'd need to have preemption disabled around plug state manipulation). Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk.h | 4 +++- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- kernel/sched/core.c | 6 ++++-- 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index cc4db4d92c75b4..71c6614a97fefb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1173,6 +1173,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + current->flags &= ~PF_BLOCK_TS; } /** diff --git a/block/blk.h b/block/blk.h index 14bbc4b780f275..913c93838a01bf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -529,8 +529,10 @@ static inline u64 blk_time_get_ns(void) * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ - if (!plug->cur_ktime) + if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } return plug->cur_ktime; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996d2ad756ff78..d7cac3de65b31b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365ba..801233cef2fc9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc903467f..083f2258182d89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6787,10 +6787,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } From b588e2d813c85a2ecbaf3d64269dfde4bb735917 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jan 2024 16:34:10 +0100 Subject: [PATCH 244/707] nvmem: include bit index in cell sysfs file name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creating sysfs files for all Cells caused a boot failure for linux-6.8-rc1 on Apple M1, which (in downstream dts files) has multiple nvmem cells that use the same byte address. This causes the device probe to fail with [ 0.605336] sysfs: cannot create duplicate filename '/devices/platform/soc@200000000/2922bc000.efuse/apple_efuses_nvmem0/cells/efuse@a10' [ 0.605347] CPU: 7 PID: 1 Comm: swapper/0 Tainted: G S 6.8.0-rc1-arnd-5+ #133 [ 0.605355] Hardware name: Apple Mac Studio (M1 Ultra, 2022) (DT) [ 0.605362] Call trace: [ 0.605365] show_stack+0x18/0x2c [ 0.605374] dump_stack_lvl+0x60/0x80 [ 0.605383] dump_stack+0x18/0x24 [ 0.605388] sysfs_warn_dup+0x64/0x80 [ 0.605395] sysfs_add_bin_file_mode_ns+0xb0/0xd4 [ 0.605402] internal_create_group+0x268/0x404 [ 0.605409] sysfs_create_groups+0x38/0x94 [ 0.605415] devm_device_add_groups+0x50/0x94 [ 0.605572] nvmem_populate_sysfs_cells+0x180/0x1b0 [ 0.605682] nvmem_register+0x38c/0x470 [ 0.605789] devm_nvmem_register+0x1c/0x6c [ 0.605895] apple_efuses_probe+0xe4/0x120 [ 0.606000] platform_probe+0xa8/0xd0 As far as I can tell, this is a problem for any device with multiple cells on different bits of the same address. Avoid the issue by changing the file name to include the first bit number. Fixes: 0331c611949f ("nvmem: core: Expose cells through sysfs") Link: https://github.com/AsahiLinux/linux/blob/bd0a1a7d4/arch/arm64/boot/dts/apple/t600x-dieX.dtsi#L156 Cc: regressions@lists.linux.dev Cc: Miquel Raynal Cc: Rafał Miłecki Cc: Chen-Yu Tsai Cc: Srinivas Kandagatla Cc: Greg Kroah-Hartman Cc: asahi@lists.linux.dev Cc: Sven Peter Signed-off-by: Arnd Bergmann Signed-off-by: Srinivas Kandagatla --- Documentation/ABI/testing/sysfs-nvmem-cells | 16 ++++++++-------- drivers/nvmem/core.c | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-nvmem-cells b/Documentation/ABI/testing/sysfs-nvmem-cells index 7af70adf3690e3..c7c9444f92a880 100644 --- a/Documentation/ABI/testing/sysfs-nvmem-cells +++ b/Documentation/ABI/testing/sysfs-nvmem-cells @@ -4,18 +4,18 @@ KernelVersion: 6.5 Contact: Miquel Raynal Description: The "cells" folder contains one file per cell exposed by the - NVMEM device. The name of the file is: @, with - being the cell name and its location in the NVMEM - device, in hexadecimal (without the '0x' prefix, to mimic device - tree node names). The length of the file is the size of the cell - (when known). The content of the file is the binary content of - the cell (may sometimes be ASCII, likely without trailing - character). + NVMEM device. The name of the file is: "@,", + with being the cell name and its location in + the NVMEM device, in hexadecimal bytes and bits (without the + '0x' prefix, to mimic device tree node names). The length of + the file is the size of the cell (when known). The content of + the file is the binary content of the cell (may sometimes be + ASCII, likely without trailing character). Note: This file is only present if CONFIG_NVMEM_SYSFS is enabled. Example:: - hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d + hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d,0 00000000 54 4e 34 38 4d 2d 50 2d 44 4e |TN48M-P-DN| 0000000a diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 980123fb4dde05..eb357ac2e54a2a 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -460,8 +460,9 @@ static int nvmem_populate_sysfs_cells(struct nvmem_device *nvmem) list_for_each_entry(entry, &nvmem->cells, node) { sysfs_bin_attr_init(&attrs[i]); attrs[i].attr.name = devm_kasprintf(&nvmem->dev, GFP_KERNEL, - "%s@%x", entry->name, - entry->offset); + "%s@%x,%x", entry->name, + entry->offset, + entry->bit_offset); attrs[i].attr.mode = 0444; attrs[i].size = entry->bytes; attrs[i].read = &nvmem_cell_attr_read; From aa8eff72842021f52600392b245fb82d113afa8a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jan 2024 17:39:19 +0100 Subject: [PATCH 245/707] x86/sme: Fix memory encryption setting if enabled by default and not overridden Commit cbebd68f59f0 ("x86/mm: Fix use of uninitialized buffer in sme_enable()") 'fixed' an issue in sme_enable() detected by static analysis, and broke the common case in the process. cmdline_find_option() will return < 0 on an error, or when the command line argument does not appear at all. In this particular case, the latter is not an error condition, and so the early exit is wrong. Instead, without mem_encrypt= on the command line, the compile time default should be honoured, which could be to enable memory encryption, and this is currently broken. Fix it by setting sme_me_mask to a preliminary value based on the compile time default, and only omitting the command line argument test when cmdline_find_option() returns an error. [ bp: Drop active_by_default while at it. ] Fixes: cbebd68f59f0 ("x86/mm: Fix use of uninitialized buffer in sme_enable()") Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240126163918.2908990-2-ardb+git@google.com --- arch/x86/mm/mem_encrypt_identity.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index d73aeb16417fcf..7f72472a34d6d9 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -507,7 +507,6 @@ void __init sme_enable(struct boot_params *bp) const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; - bool active_by_default; unsigned long me_mask; char buffer[16]; bool snp; @@ -593,22 +592,19 @@ void __init sme_enable(struct boot_params *bp) : "p" (sme_cmdline_off)); if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; + sme_me_mask = me_mask; cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | ((u64)bp->ext_cmd_line_ptr << 32)); if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) - return; + goto out; if (!strncmp(buffer, cmdline_on, sizeof(buffer))) sme_me_mask = me_mask; else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) sme_me_mask = 0; - else - sme_me_mask = active_by_default ? me_mask : 0; + out: if (sme_me_mask) { physical_mask &= ~sme_me_mask; From 290ea1d6f990c92d5ae599554d8acafdd0ddeb2e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 25 Jan 2024 07:46:44 -0600 Subject: [PATCH 246/707] nvmem: fixed-cell: Simplify nested if/then schema There's no reason to have a nested if/then schema as checking for compatible being present and containing 'mac-base' can all be done in one 'if' schema. Signed-off-by: Rob Herring Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/layouts/fixed-cell.yaml | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml index ac2381e6602790..8b3826243dddfc 100644 --- a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml +++ b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml @@ -36,20 +36,18 @@ properties: allOf: - if: + properties: + compatible: + contains: + const: mac-base required: [ compatible ] then: - if: - properties: - compatible: - contains: - const: mac-base - then: - properties: - "#nvmem-cell-cells": - description: The first argument is a MAC address offset. - const: 1 - required: - - "#nvmem-cell-cells" + properties: + "#nvmem-cell-cells": + description: The first argument is a MAC address offset. + const: 1 + required: + - "#nvmem-cell-cells" required: - reg From 4e6102d60d88975bc65a0dde05a8ba096c450249 Mon Sep 17 00:00:00 2001 From: William-tw Lin Date: Fri, 22 Dec 2023 16:07:39 +0800 Subject: [PATCH 247/707] nvmem: mtk-efuse: Register MediaTek socinfo driver from efuse The socinfo driver reads chip information from eFuses and does not need any devicetree node. Register it from mtk-efuse. While at it, also add the name for this driver's nvmem_config. Signed-off-by: William-tw Lin Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/mtk-efuse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c index 84f05b40a4112e..f5bebcecf9bd31 100644 --- a/drivers/nvmem/mtk-efuse.c +++ b/drivers/nvmem/mtk-efuse.c @@ -68,6 +68,7 @@ static int mtk_efuse_probe(struct platform_device *pdev) struct nvmem_config econfig = {}; struct mtk_efuse_priv *priv; const struct mtk_efuse_pdata *pdata; + struct platform_device *socinfo; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -85,11 +86,20 @@ static int mtk_efuse_probe(struct platform_device *pdev) econfig.size = resource_size(res); econfig.priv = priv; econfig.dev = dev; + econfig.name = "mtk-efuse"; if (pdata->uses_post_processing) econfig.fixup_dt_cell_info = &mtk_efuse_fixup_dt_cell_info; nvmem = devm_nvmem_register(dev, &econfig); + if (IS_ERR(nvmem)) + return PTR_ERR(nvmem); - return PTR_ERR_OR_ZERO(nvmem); + socinfo = platform_device_register_data(&pdev->dev, "mtk-socinfo", + PLATFORM_DEVID_AUTO, NULL, 0); + if (IS_ERR(socinfo)) + dev_info(dev, "MediaTek SoC Information will be unavailable\n"); + + platform_set_drvdata(pdev, socinfo); + return 0; } static const struct mtk_efuse_pdata mtk_mt8186_efuse_pdata = { @@ -108,8 +118,17 @@ static const struct of_device_id mtk_efuse_of_match[] = { }; MODULE_DEVICE_TABLE(of, mtk_efuse_of_match); +static void mtk_efuse_remove(struct platform_device *pdev) +{ + struct platform_device *socinfo = platform_get_drvdata(pdev); + + if (!IS_ERR_OR_NULL(socinfo)) + platform_device_unregister(socinfo); +} + static struct platform_driver mtk_efuse_driver = { .probe = mtk_efuse_probe, + .remove_new = mtk_efuse_remove, .driver = { .name = "mediatek,efuse", .of_match_table = mtk_efuse_of_match, From a0cfd5e997824d0bd8c7620d40cdb324121a2fc7 Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Mon, 8 Jan 2024 10:56:16 +0530 Subject: [PATCH 248/707] dt-bindings: nvmem: Convert xlnx,zynqmp-nvmem.txt to yaml Convert the xlnx,zynqmp-nvmem.txt to yaml. Signed-off-by: Praveen Teja Kundanala Reviewed-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/xlnx,zynqmp-nvmem.txt | 46 ------------------- .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml | 42 +++++++++++++++++ 2 files changed, 42 insertions(+), 46 deletions(-) delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt deleted file mode 100644 index 4881561b3a02ac..00000000000000 --- a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt +++ /dev/null @@ -1,46 +0,0 @@ --------------------------------------------------------------------------- -= Zynq UltraScale+ MPSoC nvmem firmware driver binding = --------------------------------------------------------------------------- -The nvmem_firmware node provides access to the hardware related data -like soc revision, IDCODE... etc, By using the firmware interface. - -Required properties: -- compatible: should be "xlnx,zynqmp-nvmem-fw" - -= Data cells = -Are child nodes of silicon id, bindings of which as described in -bindings/nvmem/nvmem.txt - -------- - Example -------- -firmware { - zynqmp_firmware: zynqmp-firmware { - compatible = "xlnx,zynqmp-firmware"; - method = "smc"; - - nvmem_firmware { - compatible = "xlnx,zynqmp-nvmem-fw"; - #address-cells = <1>; - #size-cells = <1>; - - /* Data cells */ - soc_revision: soc_revision { - reg = <0x0 0x4>; - }; - }; - }; -}; - -= Data consumers = -Are device nodes which consume nvmem data cells. - -For example: - pcap { - ... - - nvmem-cells = <&soc_revision>; - nvmem-cell-names = "soc_revision"; - - ... - }; diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml new file mode 100644 index 00000000000000..917c40d5c382f4 --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/xlnx,zynqmp-nvmem.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Zynq UltraScale+ MPSoC Non Volatile Memory interface + +description: | + The ZynqMP MPSoC provides access to the hardware related data + like SOC revision, IDCODE and specific purpose efuses. + +maintainers: + - Kalyani Akula + - Praveen Teja Kundanala + +allOf: + - $ref: nvmem.yaml# + +properties: + compatible: + const: xlnx,zynqmp-nvmem-fw + +required: + - compatible + +unevaluatedProperties: false + +examples: + - | + nvmem { + compatible = "xlnx,zynqmp-nvmem-fw"; + nvmem-layout { + compatible = "fixed-layout"; + #address-cells = <1>; + #size-cells = <1>; + + soc_revision: soc-revision@0 { + reg = <0x0 0x4>; + }; + }; + }; From 2f423b541ace886a109d4a03799ef14c22129ccb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 16:02:54 +0100 Subject: [PATCH 249/707] hwmon: Remove I2C_CLASS_HWMON from drivers w/o detect() and address_list Class-based I2C probing requires detect() and address_list to be set in the I2C client driver, see checks in i2c_detect(). It's misleading to declare I2C_CLASS_HWMON support if this precondition isn't met. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/75747c6a-d414-4b07-8f66-5a5cdddc3c36@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/adm1177.c | 1 - drivers/hwmon/ds1621.c | 1 - drivers/hwmon/ds620.c | 1 - drivers/hwmon/ina209.c | 1 - drivers/hwmon/ina238.c | 1 - drivers/hwmon/max127.c | 1 - drivers/hwmon/max31760.c | 1 - drivers/hwmon/max31790.c | 1 - drivers/hwmon/max31827.c | 1 - drivers/hwmon/max6621.c | 1 - drivers/hwmon/max6697.c | 1 - drivers/hwmon/occ/p8_i2c.c | 1 - drivers/hwmon/pmbus/ir36021.c | 1 - drivers/hwmon/powr1220.c | 1 - drivers/hwmon/sbrmi.c | 1 - drivers/hwmon/sbtsi_temp.c | 1 - drivers/hwmon/w83773g.c | 1 - 17 files changed, 17 deletions(-) diff --git a/drivers/hwmon/adm1177.c b/drivers/hwmon/adm1177.c index 60a893f271597b..3390102d2d4acf 100644 --- a/drivers/hwmon/adm1177.c +++ b/drivers/hwmon/adm1177.c @@ -250,7 +250,6 @@ static const struct of_device_id adm1177_dt_ids[] = { MODULE_DEVICE_TABLE(of, adm1177_dt_ids); static struct i2c_driver adm1177_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "adm1177", .of_match_table = adm1177_dt_ids, diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c index 21b6350465211f..bffbc80401718d 100644 --- a/drivers/hwmon/ds1621.c +++ b/drivers/hwmon/ds1621.c @@ -380,7 +380,6 @@ MODULE_DEVICE_TABLE(i2c, ds1621_id); /* This is the driver that will be inserted */ static struct i2c_driver ds1621_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "ds1621", }, diff --git a/drivers/hwmon/ds620.c b/drivers/hwmon/ds620.c index 2b09536630cb6a..4fc4df012fac70 100644 --- a/drivers/hwmon/ds620.c +++ b/drivers/hwmon/ds620.c @@ -241,7 +241,6 @@ MODULE_DEVICE_TABLE(i2c, ds620_id); /* This is the driver that will be inserted */ static struct i2c_driver ds620_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "ds620", }, diff --git a/drivers/hwmon/ina209.c b/drivers/hwmon/ina209.c index c558143e5285da..d9b57a4b3e41fb 100644 --- a/drivers/hwmon/ina209.c +++ b/drivers/hwmon/ina209.c @@ -589,7 +589,6 @@ MODULE_DEVICE_TABLE(of, ina209_of_match); /* This is the driver that will be inserted */ static struct i2c_driver ina209_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "ina209", .of_match_table = of_match_ptr(ina209_of_match), diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index ca9f5d2c811bb6..69289293bc38cc 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -629,7 +629,6 @@ static const struct of_device_id __maybe_unused ina238_of_match[] = { MODULE_DEVICE_TABLE(of, ina238_of_match); static struct i2c_driver ina238_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "ina238", .of_match_table = of_match_ptr(ina238_of_match), diff --git a/drivers/hwmon/max127.c b/drivers/hwmon/max127.c index ee5ead06d612df..da2289e3560aa0 100644 --- a/drivers/hwmon/max127.c +++ b/drivers/hwmon/max127.c @@ -335,7 +335,6 @@ static const struct i2c_device_id max127_id[] = { MODULE_DEVICE_TABLE(i2c, max127_id); static struct i2c_driver max127_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "max127", }, diff --git a/drivers/hwmon/max31760.c b/drivers/hwmon/max31760.c index 79945eb466ae43..1b6f71bc61cb54 100644 --- a/drivers/hwmon/max31760.c +++ b/drivers/hwmon/max31760.c @@ -578,7 +578,6 @@ static DEFINE_SIMPLE_DEV_PM_OPS(max31760_pm_ops, max31760_suspend, max31760_resume); static struct i2c_driver max31760_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "max31760", .of_match_table = max31760_of_match, diff --git a/drivers/hwmon/max31790.c b/drivers/hwmon/max31790.c index 0cd44c1e998a2e..3dc95196b229ad 100644 --- a/drivers/hwmon/max31790.c +++ b/drivers/hwmon/max31790.c @@ -543,7 +543,6 @@ static const struct i2c_device_id max31790_id[] = { MODULE_DEVICE_TABLE(i2c, max31790_id); static struct i2c_driver max31790_driver = { - .class = I2C_CLASS_HWMON, .probe = max31790_probe, .driver = { .name = "max31790", diff --git a/drivers/hwmon/max31827.c b/drivers/hwmon/max31827.c index 4a8c3e37c5d323..f8a13b30f100f2 100644 --- a/drivers/hwmon/max31827.c +++ b/drivers/hwmon/max31827.c @@ -652,7 +652,6 @@ static const struct of_device_id max31827_of_match[] = { MODULE_DEVICE_TABLE(of, max31827_of_match); static struct i2c_driver max31827_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "max31827", .of_match_table = max31827_of_match, diff --git a/drivers/hwmon/max6621.c b/drivers/hwmon/max6621.c index af7e6268589831..05426cde0e3635 100644 --- a/drivers/hwmon/max6621.c +++ b/drivers/hwmon/max6621.c @@ -549,7 +549,6 @@ static const struct of_device_id __maybe_unused max6621_of_match[] = { MODULE_DEVICE_TABLE(of, max6621_of_match); static struct i2c_driver max6621_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = MAX6621_DRV_NAME, .of_match_table = of_match_ptr(max6621_of_match), diff --git a/drivers/hwmon/max6697.c b/drivers/hwmon/max6697.c index 7d10dd434f2e11..d161ba0e7813cd 100644 --- a/drivers/hwmon/max6697.c +++ b/drivers/hwmon/max6697.c @@ -780,7 +780,6 @@ static const struct of_device_id __maybe_unused max6697_of_match[] = { MODULE_DEVICE_TABLE(of, max6697_of_match); static struct i2c_driver max6697_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "max6697", .of_match_table = of_match_ptr(max6697_of_match), diff --git a/drivers/hwmon/occ/p8_i2c.c b/drivers/hwmon/occ/p8_i2c.c index 06095975f5c8d2..31159606cec7b4 100644 --- a/drivers/hwmon/occ/p8_i2c.c +++ b/drivers/hwmon/occ/p8_i2c.c @@ -241,7 +241,6 @@ static const struct of_device_id p8_i2c_occ_of_match[] = { MODULE_DEVICE_TABLE(of, p8_i2c_occ_of_match); static struct i2c_driver p8_i2c_occ_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "occ-hwmon", .of_match_table = p8_i2c_occ_of_match, diff --git a/drivers/hwmon/pmbus/ir36021.c b/drivers/hwmon/pmbus/ir36021.c index 382ba6b6031a03..a263afeb8ac1e1 100644 --- a/drivers/hwmon/pmbus/ir36021.c +++ b/drivers/hwmon/pmbus/ir36021.c @@ -63,7 +63,6 @@ static const struct of_device_id __maybe_unused ir36021_of_id[] = { MODULE_DEVICE_TABLE(of, ir36021_of_id); static struct i2c_driver ir36021_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "ir36021", .of_match_table = of_match_ptr(ir36021_of_id), diff --git a/drivers/hwmon/powr1220.c b/drivers/hwmon/powr1220.c index 4120cadb00aebc..2388d0565e7ec7 100644 --- a/drivers/hwmon/powr1220.c +++ b/drivers/hwmon/powr1220.c @@ -323,7 +323,6 @@ static const struct i2c_device_id powr1220_ids[] = { MODULE_DEVICE_TABLE(i2c, powr1220_ids); static struct i2c_driver powr1220_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "powr1220", }, diff --git a/drivers/hwmon/sbrmi.c b/drivers/hwmon/sbrmi.c index 484703f0ea5f88..4318f5121145e0 100644 --- a/drivers/hwmon/sbrmi.c +++ b/drivers/hwmon/sbrmi.c @@ -342,7 +342,6 @@ static const struct of_device_id __maybe_unused sbrmi_of_match[] = { MODULE_DEVICE_TABLE(of, sbrmi_of_match); static struct i2c_driver sbrmi_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "sbrmi", .of_match_table = of_match_ptr(sbrmi_of_match), diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c index dd85cf89f008a9..a4181acb1aa6b5 100644 --- a/drivers/hwmon/sbtsi_temp.c +++ b/drivers/hwmon/sbtsi_temp.c @@ -232,7 +232,6 @@ static const struct of_device_id __maybe_unused sbtsi_of_match[] = { MODULE_DEVICE_TABLE(of, sbtsi_of_match); static struct i2c_driver sbtsi_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "sbtsi", .of_match_table = of_match_ptr(sbtsi_of_match), diff --git a/drivers/hwmon/w83773g.c b/drivers/hwmon/w83773g.c index 045eea8378c2da..401a28f55f931f 100644 --- a/drivers/hwmon/w83773g.c +++ b/drivers/hwmon/w83773g.c @@ -290,7 +290,6 @@ static int w83773_probe(struct i2c_client *client) } static struct i2c_driver w83773_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "w83773g", .of_match_table = of_match_ptr(w83773_of_match), From 10e70ab10802e50ff6432964dc289d2bf93c2693 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 27 Jan 2024 07:40:58 -0800 Subject: [PATCH 250/707] MAINTAINERS: Drop entries for hwmon devices with unreachable maintainers Drop maintainer entries for MAX31760 and MAX31827 since the e-mail addresses of their maintainers is no longer reachable and there is no known alternative means to contact them. HWMON drivers have a subsystem maintainer, so individual maintainer entries are not mandatory. Reported-by: Heiner Kallweit Cc: Heiner Kallweit Signed-off-by: Guenter Roeck --- MAINTAINERS | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a6924..5e7239cb40ea6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1384,15 +1384,6 @@ F: drivers/iio/amplifiers/hmc425a.c F: drivers/staging/iio/*/ad* X: drivers/iio/*/adjd* -ANALOG DEVICES INC MAX31760 DRIVER -M: Ibrahim Tilki -S: Maintained -W: http://wiki.analog.com/ -W: https://ez.analog.com/linux-software-drivers -F: Documentation/devicetree/bindings/hwmon/adi,max31760.yaml -F: Documentation/hwmon/max31760.rst -F: drivers/hwmon/max31760.c - ANALOGBITS PLL LIBRARIES M: Paul Walmsley S: Supported @@ -13130,15 +13121,6 @@ F: Documentation/userspace-api/media/drivers/max2175.rst F: drivers/media/i2c/max2175* F: include/uapi/linux/max2175.h -MAX31827 TEMPERATURE SWITCH DRIVER -M: Daniel Matyas -L: linux-hwmon@vger.kernel.org -S: Supported -W: https://ez.analog.com/linux-software-drivers -F: Documentation/devicetree/bindings/hwmon/adi,max31827.yaml -F: Documentation/hwmon/max31827.rst -F: drivers/hwmon/max31827.c - MAX31335 RTC DRIVER M: Antoniu Miclaus L: linux-rtc@vger.kernel.org From fb27638836ac2b4334bae9421b04563c1cd74a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 26 Jan 2024 13:04:33 +0100 Subject: [PATCH 251/707] pwm: atmel-hlcdc: Fix clock imbalance related to suspend support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The suspend callback disables the periph clock when the PWM is enabled and resume reenables this clock if the PWM was disabled before. Judging from the code comment it's suspend that is wrong here. Fix accordingly. Fixes: f9bb9da7c09d ("pwm: atmel-hlcdc: Implement the suspend/resume hooks") Link: https://lore.kernel.org/r/b51ea92b0a45eff3dc83b08adefd43d930df996c.1706269232.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-atmel-hlcdc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c index 3f2c5031a3ba85..1f6fc9a9fcf3ec 100644 --- a/drivers/pwm/pwm-atmel-hlcdc.c +++ b/drivers/pwm/pwm-atmel-hlcdc.c @@ -185,7 +185,7 @@ static int atmel_hlcdc_pwm_suspend(struct device *dev) struct atmel_hlcdc_pwm *atmel = dev_get_drvdata(dev); /* Keep the periph clock enabled if the PWM is still running. */ - if (pwm_is_enabled(&atmel->chip.pwms[0])) + if (!pwm_is_enabled(&atmel->chip.pwms[0])) clk_disable_unprepare(atmel->hlcdc->periph_clk); return 0; From 5043456b2de107ae65dcff7f49caf6c8f99169b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:32 +0100 Subject: [PATCH 252/707] pwm: Drop useless member .of_pwm_n_cells of struct pwm_chip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apart from the two of_xlate implementations this member is write-only. In the of_xlate functions of_pwm_xlate_with_flags() and of_pwm_single_xlate() it's more sensible to check for args->args_count because this is what is actually used in the device tree. Acked-by: Douglas Anderson Link: https://lore.kernel.org/r/53d8c545aa8f79a920358be9e72e382b3981bdc4.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 1 - drivers/pwm/core.c | 22 +++------------------- drivers/pwm/pwm-clps711x.c | 1 - drivers/pwm/pwm-cros-ec.c | 1 - drivers/pwm/pwm-pxa.c | 4 +--- include/linux/pwm.h | 2 -- 6 files changed, 4 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 62cc3893dca5d3..1f6e929c2f6a3c 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -1591,7 +1591,6 @@ static int ti_sn_pwm_probe(struct auxiliary_device *adev, pdata->pchip.ops = &ti_sn_pwm_ops; pdata->pchip.npwm = 1; pdata->pchip.of_xlate = of_pwm_single_xlate; - pdata->pchip.of_pwm_n_cells = 1; devm_pm_runtime_enable(&adev->dev); diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index f2728ee787d7a5..31f210872a079f 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -107,9 +107,6 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg { struct pwm_device *pwm; - if (chip->of_pwm_n_cells < 2) - return ERR_PTR(-EINVAL); - /* flags in the third cell are optional */ if (args->args_count < 2) return ERR_PTR(-EINVAL); @@ -124,10 +121,8 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg pwm->args.period = args->args[1]; pwm->args.polarity = PWM_POLARITY_NORMAL; - if (chip->of_pwm_n_cells >= 3) { - if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED) - pwm->args.polarity = PWM_POLARITY_INVERSED; - } + if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED) + pwm->args.polarity = PWM_POLARITY_INVERSED; return pwm; } @@ -138,9 +133,6 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args) { struct pwm_device *pwm; - if (chip->of_pwm_n_cells < 1) - return ERR_PTR(-EINVAL); - /* validate that one cell is specified, optionally with flags */ if (args->args_count != 1 && args->args_count != 2) return ERR_PTR(-EINVAL); @@ -164,16 +156,8 @@ static void of_pwmchip_add(struct pwm_chip *chip) if (!chip->dev || !chip->dev->of_node) return; - if (!chip->of_xlate) { - u32 pwm_cells; - - if (of_property_read_u32(chip->dev->of_node, "#pwm-cells", - &pwm_cells)) - pwm_cells = 2; - + if (!chip->of_xlate) chip->of_xlate = of_pwm_xlate_with_flags; - chip->of_pwm_n_cells = pwm_cells; - } of_node_get(chip->dev->of_node); } diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c index 42179b3f7ec399..06562d4bb9633e 100644 --- a/drivers/pwm/pwm-clps711x.c +++ b/drivers/pwm/pwm-clps711x.c @@ -103,7 +103,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev) priv->chip.dev = &pdev->dev; priv->chip.npwm = 2; priv->chip.of_xlate = clps711x_pwm_xlate; - priv->chip.of_pwm_n_cells = 1; spin_lock_init(&priv->lock); diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 5fe303b8656def..339cedf3a7b18b 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -279,7 +279,6 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) chip->dev = dev; chip->ops = &cros_ec_pwm_ops; chip->of_xlate = cros_ec_pwm_xlate; - chip->of_pwm_n_cells = 1; if (ec_pwm->use_pwm_type) { chip->npwm = CROS_EC_PWM_DT_COUNT; diff --git a/drivers/pwm/pwm-pxa.c b/drivers/pwm/pwm-pxa.c index 76685f926c7586..61b74fa1d3481a 100644 --- a/drivers/pwm/pwm-pxa.c +++ b/drivers/pwm/pwm-pxa.c @@ -180,10 +180,8 @@ static int pwm_probe(struct platform_device *pdev) pc->chip.ops = &pxa_pwm_ops; pc->chip.npwm = (id->driver_data & HAS_SECONDARY_PWM) ? 2 : 1; - if (IS_ENABLED(CONFIG_OF)) { + if (IS_ENABLED(CONFIG_OF)) pc->chip.of_xlate = of_pwm_single_xlate; - pc->chip.of_pwm_n_cells = 1; - } pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pc->mmio_base)) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index fcc2c4496f7316..8ffe9ae7a23a95 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -271,7 +271,6 @@ struct pwm_ops { * @id: unique number of this PWM chip * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier - * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context * @pwms: array of PWM devices allocated by the framework */ @@ -284,7 +283,6 @@ struct pwm_chip { struct pwm_device * (*of_xlate)(struct pwm_chip *chip, const struct of_phandle_args *args); - unsigned int of_pwm_n_cells; bool atomic; /* only used internally by the PWM framework */ From 4f0f7dce06cd1cd1eb3cca11561634f5cd44b0b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:33 +0100 Subject: [PATCH 253/707] pwm: Let the of_xlate callbacks accept references without period MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this extension of_pwm_xlate_with_flags() is suitable to replace the custom xlate function of the pwm-clps711x driver. While touching these very similar functions align their implementations. Link: https://lore.kernel.org/r/127622315d07d9d419ae8e6373c7e5be7fab7a62.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 31f210872a079f..606d9ef0c70974 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -107,8 +107,8 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg { struct pwm_device *pwm; - /* flags in the third cell are optional */ - if (args->args_count < 2) + /* period in the second cell and flags in the third cell are optional */ + if (args->args_count < 1) return ERR_PTR(-EINVAL); if (args->args[0] >= chip->npwm) @@ -118,9 +118,10 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg if (IS_ERR(pwm)) return pwm; - pwm->args.period = args->args[1]; - pwm->args.polarity = PWM_POLARITY_NORMAL; + if (args->args_count > 1) + pwm->args.period = args->args[1]; + pwm->args.polarity = PWM_POLARITY_NORMAL; if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED) pwm->args.polarity = PWM_POLARITY_INVERSED; @@ -133,18 +134,15 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args) { struct pwm_device *pwm; - /* validate that one cell is specified, optionally with flags */ - if (args->args_count != 1 && args->args_count != 2) - return ERR_PTR(-EINVAL); - pwm = pwm_request_from_chip(chip, 0, NULL); if (IS_ERR(pwm)) return pwm; - pwm->args.period = args->args[0]; - pwm->args.polarity = PWM_POLARITY_NORMAL; + if (args->args_count > 1) + pwm->args.period = args->args[0]; - if (args->args_count == 2 && args->args[1] & PWM_POLARITY_INVERTED) + pwm->args.polarity = PWM_POLARITY_NORMAL; + if (args->args_count > 1 && args->args[1] & PWM_POLARITY_INVERTED) pwm->args.polarity = PWM_POLARITY_INVERSED; return pwm; From d1d29cd19653cd94e0f5aab152b46a539b1725e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:34 +0100 Subject: [PATCH 254/707] pwm: clps711x: Drop custom .of_xlate() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default of_xlate callback (of_pwm_xlate_with_flags()) does everything the drivers expects from its .of_xlate() callback. So drop the custom implementation. Link: https://lore.kernel.org/r/f58336c298d536107de5cab6a57e19f957ab326c.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-clps711x.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c index 06562d4bb9633e..f3b4af7963bede 100644 --- a/drivers/pwm/pwm-clps711x.c +++ b/drivers/pwm/pwm-clps711x.c @@ -74,15 +74,6 @@ static const struct pwm_ops clps711x_pwm_ops = { .apply = clps711x_pwm_apply, }; -static struct pwm_device *clps711x_pwm_xlate(struct pwm_chip *chip, - const struct of_phandle_args *args) -{ - if (args->args[0] >= chip->npwm) - return ERR_PTR(-EINVAL); - - return pwm_request_from_chip(chip, args->args[0], NULL); -} - static int clps711x_pwm_probe(struct platform_device *pdev) { struct clps711x_chip *priv; @@ -102,7 +93,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev) priv->chip.ops = &clps711x_pwm_ops; priv->chip.dev = &pdev->dev; priv->chip.npwm = 2; - priv->chip.of_xlate = clps711x_pwm_xlate; spin_lock_init(&priv->lock); From 966ed5ea39e654bdefd2b3b7d9b91b5fa87ed84b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:35 +0100 Subject: [PATCH 255/707] pwm: Drop duplicate check against chip->npwm in of_pwm_xlate_with_flags() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit args->args[0] is passed as parameter "index" to pwm_request_from_chip(). The latter function also checks for index >= npwm, so of_pwm_xlate_with_flags() doesn't need to do that. Link: https://lore.kernel.org/r/b06e445a6ed62a339add727eccb969a33d678386.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 606d9ef0c70974..b025d90e201c94 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -111,9 +111,6 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg if (args->args_count < 1) return ERR_PTR(-EINVAL); - if (args->args[0] >= chip->npwm) - return ERR_PTR(-EINVAL); - pwm = pwm_request_from_chip(chip, args->args[0], NULL); if (IS_ERR(pwm)) return pwm; From c7005dd95719942b21dce6f02a94b6d412550714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 Jan 2024 09:56:49 +0100 Subject: [PATCH 256/707] pwm: mediatek: Update kernel doc for struct pwm_mediatek_of_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct never had a member called clk_freq. This fixes the W=1 warning: drivers/pwm/pwm-mediatek.c:60: warning: Excess struct member 'clk_freq' description in 'pwm_mediatek_chip' Fixes: efecdeb82f21 ("pwm: mediatek: Allocate the clks array dynamically") Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240125085649.1571268-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-mediatek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 17d290f847af60..562102a47ac044 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -47,7 +47,6 @@ struct pwm_mediatek_of_data { * @clk_top: the top clock generator * @clk_main: the clock used by PWM core * @clk_pwms: the clock used by each PWM channel - * @clk_freq: the fix clock frequency of legacy MIPS SoC * @soc: pointer to chip's platform data */ struct pwm_mediatek_chip { From 979c6fe7e799d2cab0a99c4b8c41cc48f10aca0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Sun, 7 Jan 2024 12:46:59 +0100 Subject: [PATCH 257/707] dt-bindings: pxa-pwm: Convert to YAML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the PXA PWM binding file from TXT to YAML. The original binding does not mention any clocks, but the PWM controller will not probe without a clock. Reviewed-by: Uwe Kleine-König Signed-off-by: Duje Mihanović Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240107-pxa-pwm-yaml-v3-1-92ac90911c3f@skole.hr Signed-off-by: Uwe Kleine-König --- .../bindings/pwm/marvell,pxa-pwm.yaml | 51 +++++++++++++++++++ .../devicetree/bindings/pwm/pxa-pwm.txt | 30 ----------- 2 files changed, 51 insertions(+), 30 deletions(-) create mode 100644 Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml delete mode 100644 Documentation/devicetree/bindings/pwm/pxa-pwm.txt diff --git a/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml b/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml new file mode 100644 index 00000000000000..ba6325575ea040 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/marvell,pxa-pwm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Marvell PXA PWM + +maintainers: + - Duje Mihanović + +allOf: + - $ref: pwm.yaml# + +properties: + compatible: + enum: + - marvell,pxa250-pwm + - marvell,pxa270-pwm + - marvell,pxa168-pwm + - marvell,pxa910-pwm + + reg: + # Length should be 0x10 + maxItems: 1 + + "#pwm-cells": + # Used for specifying the period length in nanoseconds + const: 1 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - "#pwm-cells" + - clocks + +additionalProperties: false + +examples: + - | + #include + + pwm0: pwm@40b00000 { + compatible = "marvell,pxa250-pwm"; + reg = <0x40b00000 0x10>; + #pwm-cells = <1>; + clocks = <&clks CLK_PWM0>; + }; diff --git a/Documentation/devicetree/bindings/pwm/pxa-pwm.txt b/Documentation/devicetree/bindings/pwm/pxa-pwm.txt deleted file mode 100644 index 5ae9f1e3c33896..00000000000000 --- a/Documentation/devicetree/bindings/pwm/pxa-pwm.txt +++ /dev/null @@ -1,30 +0,0 @@ -Marvell PWM controller - -Required properties: -- compatible: should be one or more of: - - "marvell,pxa250-pwm" - - "marvell,pxa270-pwm" - - "marvell,pxa168-pwm" - - "marvell,pxa910-pwm" -- reg: Physical base address and length of the registers used by the PWM channel - Note that one device instance must be created for each PWM that is used, so the - length covers only the register window for one PWM output, not that of the - entire PWM controller. Currently length is 0x10 for all supported devices. -- #pwm-cells: Should be 1. This cell is used to specify the period in - nanoseconds. - -Example PWM device node: - -pwm0: pwm@40b00000 { - compatible = "marvell,pxa250-pwm"; - reg = <0x40b00000 0x10>; - #pwm-cells = <1>; -}; - -Example PWM client node: - -backlight { - compatible = "pwm-backlight"; - pwms = <&pwm0 5000000>; - ... -} From 909d8d33f8b4664c9b6c7fd585114921af77fc2b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 18:45:00 +0100 Subject: [PATCH 258/707] hwmon: Drop non-functional I2C_CLASS_HWMON support for drivers w/o detect() Class-based I2C probing requires detect() and address_list both to be set in the I2C client driver, see checks in i2c_detect(). It's misleading to declare I2C_CLASS_HWMON support if the driver doesn't implement detect(). Class-based probing is a legacy mechanism, in addition apparently nobody ever noticed that class-based probing has been non-functional in both drivers from the very beginning. So drop the fragments of class-based probing support. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/13ce7c11-a958-4892-ada9-faf5bfdcb89d@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/emc2305.rst | 1 - drivers/hwmon/adt7410.c | 2 -- drivers/hwmon/emc2305.c | 5 ----- 3 files changed, 8 deletions(-) diff --git a/Documentation/hwmon/emc2305.rst b/Documentation/hwmon/emc2305.rst index 2403dbaf272891..d0bfffe463587b 100644 --- a/Documentation/hwmon/emc2305.rst +++ b/Documentation/hwmon/emc2305.rst @@ -6,7 +6,6 @@ Kernel driver emc2305 Supported chips: Microchip EMC2305, EMC2303, EMC2302, EMC2301 - Addresses scanned: I2C 0x27, 0x2c, 0x2d, 0x2e, 0x2f, 0x4c, 0x4d Prefixes: 'emc2305' Datasheet: Publicly available at the Microchip website : diff --git a/drivers/hwmon/adt7410.c b/drivers/hwmon/adt7410.c index 95250677933650..fd214d9b3a895a 100644 --- a/drivers/hwmon/adt7410.c +++ b/drivers/hwmon/adt7410.c @@ -95,14 +95,12 @@ static const struct i2c_device_id adt7410_ids[] = { MODULE_DEVICE_TABLE(i2c, adt7410_ids); static struct i2c_driver adt7410_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "adt7410", .pm = pm_sleep_ptr(&adt7x10_dev_pm_ops), }, .probe = adt7410_i2c_probe, .id_table = adt7410_ids, - .address_list = I2C_ADDRS(0x48, 0x49, 0x4a, 0x4b), }; module_i2c_driver(adt7410_driver); diff --git a/drivers/hwmon/emc2305.c b/drivers/hwmon/emc2305.c index 29f0e4945f1924..6ef733c0be1675 100644 --- a/drivers/hwmon/emc2305.c +++ b/drivers/hwmon/emc2305.c @@ -12,9 +12,6 @@ #include #include -static const unsigned short -emc2305_normal_i2c[] = { 0x27, 0x2c, 0x2d, 0x2e, 0x2f, 0x4c, 0x4d, I2C_CLIENT_END }; - #define EMC2305_REG_DRIVE_FAIL_STATUS 0x27 #define EMC2305_REG_VENDOR 0xfe #define EMC2305_FAN_MAX 0xff @@ -611,14 +608,12 @@ static void emc2305_remove(struct i2c_client *client) } static struct i2c_driver emc2305_driver = { - .class = I2C_CLASS_HWMON, .driver = { .name = "emc2305", }, .probe = emc2305_probe, .remove = emc2305_remove, .id_table = emc2305_ids, - .address_list = emc2305_normal_i2c, }; module_i2c_driver(emc2305_driver); From a16ab6713e2e4993e6a7152e848aee65b038cd1c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 27 Jan 2024 14:24:07 +0100 Subject: [PATCH 259/707] pidfd: don't do_notify_pidfd() if !thread_group_empty() do_notify_pidfd() makes no sense until the whole thread group exits, change do_notify_parent() to check thread_group_empty(). This avoids the unnecessary do_notify_pidfd() when tsk is not a leader, or it exits before other threads, or it has a ptraced EXIT_ZOMBIE sub-thread. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240127132407.GA29136@redhat.com Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- kernel/signal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index c9c57d053ce4f6..9561a3962ca687 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2050,9 +2050,11 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - - /* Wake up all pidfd waiters */ - do_notify_pidfd(tsk); + /* + * tsk is a group leader and has no threads, wake up the pidfd waiters. + */ + if (thread_group_empty(tsk)) + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* From e0bad07869f7bf341152b37df89d30b9d2cfd7b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 21:01:05 -0500 Subject: [PATCH 260/707] fs/pipe: Convert to lockdep_cmp_fn *_lock_nested() is fundamentally broken; lockdep needs to check lock ordering, but we cannot device a total ordering on an unbounded number of elements with only a few subclasses. the replacement is to define lock ordering with a proper comparison function. fs/pipe.c was already doing everything correctly otherwise, nothing much changes here. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240127020111.487218-2-kent.overstreet@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pipe.c | 81 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 45 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f1adbfe743d4a7..50c8a8596b5245 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul 2002-05-09 */ -static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +#define cmp_int(l, r) ((l > r) - (l < r)) + +#ifdef CONFIG_PROVE_LOCKING +static int pipe_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) { - if (pipe->files) - mutex_lock_nested(&pipe->mutex, subclass); + return cmp_int((unsigned long) a, (unsigned long) b); } +#endif void pipe_lock(struct pipe_inode_info *pipe) { - /* - * pipe_lock() nests non-pipe inode locks (for writing to a file) - */ - pipe_lock_nested(pipe, I_MUTEX_PARENT); + if (pipe->files) + mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); @@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) -{ - mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); -} - -static inline void __pipe_unlock(struct pipe_inode_info *pipe) -{ - mutex_unlock(&pipe->mutex); -} - void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); - if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); - } + if (pipe1 > pipe2) + swap(pipe1, pipe2); + + pipe_lock(pipe1); + pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, @@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started @@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * We only get here if we didn't actually read anything. @@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); @@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we @@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; @@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[tail & mask].len; tail++; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } @@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) @@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; @@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return retval; } @@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); + lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } @@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp) filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return 0; err_rd: @@ -1230,7 +1221,7 @@ static int fifo_open(struct inode *inode, struct file *filp) goto err; err: - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; @@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: @@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } From e248feeb2519e4a00e44482f5045d1ae4ec7fa13 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Sun, 24 Dec 2023 16:20:33 +0800 Subject: [PATCH 261/707] SUNRPC: fix a memleak in gss_import_v2_context The ctx->mech_used.data allocated by kmemdup is not freed in neither gss_import_v2_context nor it only caller gss_krb5_import_sec_context, which frees ctx on error. Thus, this patch reform the last call of gss_import_v2_context to the gss_krb5_import_ctx_v2, preventing the memleak while keepping the return formation. Fixes: 47d848077629 ("gss_krb5: handle new context format from gssd") Signed-off-by: Zhipeng Lu Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_mech.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 64cff717c3d9b3..3366505bc669a0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -398,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, u64 seq_send64; int keylen; u32 time32; + int ret; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); if (IS_ERR(p)) @@ -450,8 +451,16 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, } ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; - return gss_krb5_import_ctx_v2(ctx, gfp_mask); + ret = gss_krb5_import_ctx_v2(ctx, gfp_mask); + if (ret) { + p = ERR_PTR(ret); + goto out_free; + } + return 0; + +out_free: + kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } From b66d75139458ec91c4e1c0f33b2c7f676df24604 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Tue, 2 Jan 2024 13:38:13 +0800 Subject: [PATCH 262/707] SUNRPC: fix some memleaks in gssx_dec_option_array The creds and oa->data need to be freed in the error-handling paths after their allocation. So this patch add these deallocations in the corresponding paths. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Signed-off-by: Zhipeng Lu Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_rpc_xdr.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index d79f12c2550ac3..cb32ab9a839521 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { - kfree(oa->data); - return -ENOMEM; + err = -ENOMEM; + goto free_oa; } oa->data[0].option.data = CREDS_VALUE; @@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* option buffer */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } length = be32_to_cpup(p); p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } if (length == sizeof(CREDS_VALUE) && memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) { /* We have creds here. parse them */ err = gssx_dec_linux_creds(xdr, creds); if (err) - return err; + goto free_creds; oa->data[0].value.len = 1; /* presence */ } else { /* consume uninteresting buffer */ err = gssx_dec_buffer(xdr, &dummy); if (err) - return err; + goto free_creds; } } return 0; + +free_creds: + kfree(creds); +free_oa: + kfree(oa->data); + oa->data = NULL; + return err; } static int gssx_dec_status(struct xdr_stream *xdr, From c7cb8c19bc73e6a6ee3dbabbc8299b8f469ee4e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Jan 2024 11:37:45 -0500 Subject: [PATCH 263/707] SUNRPC: Use a static buffer for the checksum initialization vector Allocating and zeroing a buffer during every call to krb5_etm_checksum() is inefficient. Instead, set aside a static buffer that is the maximum crypto block size, and use a portion (or all) of that. Reported-by: Markus Elfring Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index d2b02710ab0709..b2c1b683a88ee2 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -921,6 +921,8 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, * Caller provides the truncation length of the output token (h) in * cksumout.len. * + * Note that for RPCSEC, the "initial cipher state" is always all zeroes. + * * Return values: * %GSS_S_COMPLETE: Digest computed, @cksumout filled in * %GSS_S_FAILURE: Call failed @@ -931,22 +933,19 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, int body_offset, struct xdr_netobj *cksumout) { unsigned int ivsize = crypto_sync_skcipher_ivsize(cipher); + static const u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; struct ahash_request *req; struct scatterlist sg[1]; - u8 *iv, *checksumdata; int err = -ENOMEM; + u8 *checksumdata; checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL); if (!checksumdata) return GSS_S_FAILURE; - /* For RPCSEC, the "initial cipher state" is always all zeroes. */ - iv = kzalloc(ivsize, GFP_KERNEL); - if (!iv) - goto out_free_mem; req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) - goto out_free_mem; + goto out_free_cksumdata; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); err = crypto_ahash_init(req); if (err) @@ -970,8 +969,7 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, out_free_ahash: ahash_request_free(req); -out_free_mem: - kfree(iv); +out_free_cksumdata: kfree_sensitive(checksumdata); return err ? GSS_S_FAILURE : GSS_S_COMPLETE; } From 2df0218807f5846a235039410e44048bb676d0f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 12:18:30 +1100 Subject: [PATCH 264/707] nfsd: Don't leave work of closing files to a work queue The work of closing a file can have non-trivial cost. Doing it in a separate work queue thread means that cost isn't imposed on the nfsd threads and an imbalance can be created. This can result in files being queued for the work queue more quickly that the work queue can process them, resulting in unbounded growth of the queue and memory exhaustion. To avoid this work imbalance that exhausts memory, this patch moves all closing of files into the nfsd threads. This means that when the work imposes a cost, that cost appears where it would be expected - in the work of the nfsd thread. A subsequent patch will ensure the final __fput() is called in the same (nfsd) thread which calls filp_close(). Files opened for NFSv3 are never explicitly closed by the client and are kept open by the server in the "filecache", which responds to memory pressure, is garbage collected even when there is no pressure, and sometimes closes files when there is particular need such as for rename. These files currently have filp_close() called in a dedicated work queue, so their __fput() can have no effect on nfsd threads. This patch discards the work queue and instead has each nfsd thread call flip_close() on as many as 8 files from the filecache each time it acts on a client request (or finds there are no pending client requests). If there are more to be closed, more threads are woken. This spreads the work of __fput() over multiple threads and imposes any cost on those threads. The number 8 is somewhat arbitrary. It needs to be greater than 1 to ensure that files are closed more quickly than they can be added to the cache. It needs to be small enough to limit the per-request delays that will be imposed on clients when all threads are busy closing files. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 67 +++++++++++++++++++++------------------------ fs/nfsd/filecache.h | 1 + fs/nfsd/nfssvc.c | 2 ++ 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb7f0c33df587..f8b100bca6e4da 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { - struct work_struct work; spinlock_t lock; struct list_head freeme; }; -static struct workqueue_struct *nfsd_filecache_wq __read_mostly; - static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; static struct list_lru nfsd_file_lru; @@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) spin_lock(&l->lock); list_move_tail(&nf->nf_lru, &l->freeme); spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); + svc_wake_up(nn->nfsd_serv); + } +} + +/** + * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed. + * @nn: nfsd_net in which to find files to be disposed. + * + * When files held open for nfsv3 are removed from the filecache, whether + * due to memory pressure or garbage collection, they are queued to + * a per-net-ns queue. This function completes the disposal, either + * directly or by waking another nfsd thread to help with the work. + */ +void nfsd_file_net_dispose(struct nfsd_net *nn) +{ + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + if (!list_empty(&l->freeme)) { + LIST_HEAD(dispose); + int i; + + spin_lock(&l->lock); + for (i = 0; i < 8 && !list_empty(&l->freeme); i++) + list_move(l->freeme.next, &dispose); + spin_unlock(&l->lock); + if (!list_empty(&l->freeme)) + /* Wake up another thread to share the work + * *before* doing any actual disposing. + */ + svc_wake_up(nn->nfsd_serv); + nfsd_file_dispose_list(&dispose); } } @@ -634,27 +661,6 @@ nfsd_file_close_inode_sync(struct inode *inode) flush_delayed_fput(); } -/** - * nfsd_file_delayed_close - close unused nfsd_files - * @work: dummy - * - * Scrape the freeme list for this nfsd_net, and then dispose of them - * all. - */ -static void -nfsd_file_delayed_close(struct work_struct *work) -{ - LIST_HEAD(head); - struct nfsd_fcache_disposal *l = container_of(work, - struct nfsd_fcache_disposal, work); - - spin_lock(&l->lock); - list_splice_init(&l->freeme, &head); - spin_unlock(&l->lock); - - nfsd_file_dispose_list(&head); -} - static int nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, void *data) @@ -717,10 +723,6 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0); - if (!nfsd_filecache_wq) - goto out; - nfsd_file_slab = kmem_cache_create("nfsd_file", sizeof(struct nfsd_file), 0, 0, NULL); if (!nfsd_file_slab) { @@ -735,7 +737,6 @@ nfsd_file_cache_init(void) goto out_err; } - ret = list_lru_init(&nfsd_file_lru); if (ret) { pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); @@ -785,8 +786,6 @@ nfsd_file_cache_init(void) nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); goto out; } @@ -832,7 +831,6 @@ nfsd_alloc_fcache_disposal(void) l = kmalloc(sizeof(*l), GFP_KERNEL); if (!l) return NULL; - INIT_WORK(&l->work, nfsd_file_delayed_close); spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -841,7 +839,6 @@ nfsd_alloc_fcache_disposal(void) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); kfree(l); } @@ -910,8 +907,6 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); for_each_possible_cpu(i) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index e54165a3224f0b..c61884def906d0 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); +void nfsd_file_net_dispose(struct nfsd_net *nn); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a667802e08e75f..9a894c3511baf3 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -941,6 +941,8 @@ nfsd(void *vrqstp) rqstp->rq_server->sv_maxconn = nn->max_connections; svc_recv(rqstp); + + nfsd_file_net_dispose(nn); } atomic_dec(&nfsdstats.th_cnt); From fc6bfd4bace624be26652cbd5efa1deb2d4c3e0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 12:18:31 +1100 Subject: [PATCH 265/707] nfsd: use __fput_sync() to avoid delayed closing of files. Calling fput() directly or though filp_close() from a kernel thread like nfsd causes the final __fput() (if necessary) to be called from a workqueue. This means that nfsd is not forced to wait for any work to complete. If the ->release or ->destroy_inode function is slow for any reason, this can result in nfsd closing files more quickly than the workqueue can complete the close and the queue of pending closes can grow without bounces (30 million has been seen at one customer site, though this was in part due to a slowness in xfs which has since been fixed). nfsd does not need this. It is quite appropriate and safe for nfsd to do its own close work. There is no reason that close should ever wait for nfsd, so no deadlock can occur. It should be safe and sensible to change all fput() calls to __fput_sync(). However in the interests of caution this patch only changes two - the two that can be most directly affected by client behaviour and could occur at high frequency. - the fput() implicitly in flip_close() is changed to __fput_sync() by calling get_file() first to ensure filp_close() doesn't do the final fput() itself. If is where files opened for IO are closed. - the fput() in nfsd_read() is also changed. This is where directories opened for readdir are closed. This ensure that minimal fput work is queued to the workqueue. This removes the need for the flush_delayed_fput() call in nfsd_file_close_inode_sync() Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 3 +-- fs/nfsd/vfs.c | 42 +++++++++++++++++++++++++++++++++++++----- fs/nfsd/vfs.h | 2 ++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f8b100bca6e4da..8d9f7b07e35b39 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -280,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { nfsd_file_check_write_error(nf); - filp_close(nf->nf_file, NULL); + nfsd_filp_close(nf->nf_file); } /* @@ -658,7 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode) list_del_init(&nf->nf_lru); nfsd_file_free(nf); } - flush_delayed_fput(); } static int diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b7c7a9273ea01d..f57749cd6f0b1a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1906,10 +1906,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, fh_drop_write(ffhp); /* - * If the target dentry has cached open files, then we need to try to - * close them prior to doing the rename. Flushing delayed fput - * shouldn't be done with locks held however, so we delay it until this - * point and then reattempt the whole shebang. + * If the target dentry has cached open files, then we need to + * try to close them prior to doing the rename. Final fput + * shouldn't be done with locks held however, so we delay it + * until this point and then reattempt the whole shebang. */ if (close_cached) { close_cached = false; @@ -2177,11 +2177,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - fput(file); + nfsd_filp_close(file); out: return err; } +/** + * nfsd_filp_close: close a file synchronously + * @fp: the file to close + * + * nfsd_filp_close() is similar in behaviour to filp_close(). + * The difference is that if this is the final close on the + * file, the that finalisation happens immediately, rather then + * being handed over to a work_queue, as it the case for + * filp_close(). + * When a user-space process closes a file (even when using + * filp_close() the finalisation happens before returning to + * userspace, so it is effectively synchronous. When a kernel thread + * uses file_close(), on the other hand, the handling is completely + * asynchronous. This means that any cost imposed by that finalisation + * is not imposed on the nfsd thread, and nfsd could potentually + * close files more quickly than the work queue finalises the close, + * which would lead to unbounded growth in the queue. + * + * In some contexts is it not safe to synchronously wait for + * close finalisation (see comment for __fput_sync()), but nfsd + * does not match those contexts. In partcilarly it does not, at the + * time that this function is called, hold and locks and no finalisation + * of any file, socket, or device driver would have any cause to wait + * for nfsd to make progress. + */ +void nfsd_filp_close(struct file *fp) +{ + get_file(fp); + filp_close(fp, NULL); + __fput_sync(fp); +} + /* * Get file system stats * N.B. After this call fhp needs an fh_put diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 702fbc4483bf16..1efa4e8dfb0349 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -148,6 +148,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); +void nfsd_filp_close(struct file *fp); + static inline int fh_want_write(struct svc_fh *fh) { int ret; From 26efb5e8c0b15f4f5da1046a3e3ea1e51c9f71ee Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 17 Jan 2024 14:48:04 +1100 Subject: [PATCH 266/707] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru() move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held. This can lead to a deadlock as move_to_close_lru() waits for sc_count to drop to 2, and some threads holding a reference might be waiting for either mutex. These references will never be dropped so sc_count will never reach 2. There can be no harm in dropping ->st_mutex to before move_to_close_lru() because the only place that takes the mutex is nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called. Similarly dropping .rp_mutex is safe after the state is closed and so no longer usable. Another way to look at this is that nothing significant happens between when nfsd4_close() now calls nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls nfsd4_cstate_clear_replay() a little later. See also https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/ where this problem was raised but not successfully resolved. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6dc6340e28529d..d7d561b29fb0d9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6991,7 +6991,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, return status; } -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; @@ -7008,11 +7008,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) list_for_each_entry(stp, &reaplist, st_locks) nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); + return false; } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); - if (unhashed) - move_to_close_lru(s, clp->net); + return unhashed; } } @@ -7028,6 +7028,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool need_move_to_close_list; dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); @@ -7050,8 +7051,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - nfsd4_close_open_stateid(stp); + need_move_to_close_list = nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); + if (need_move_to_close_list) { + /* Drop the replay mutex early as move_to_close_lru() + * can wait for other threads which hold that mutex. + * This call is idempotent, so that fact that it will + * be called twice is harmless. + */ + nfsd4_cstate_clear_replay(cstate); + move_to_close_lru(stp, net); + } /* v4.1+ suggests that we send a special stateid in here, since the * clients should just ignore this anyway. Since this is not useful From 135d5626e96952a2d09abdea5622e689aff2c4a6 Mon Sep 17 00:00:00 2001 From: Jorge Mora Date: Thu, 25 Jan 2024 07:46:54 -0700 Subject: [PATCH 267/707] NFSD: fix nfsd4_listxattr_validate_cookie If LISTXATTRS is sent with a correct cookie but a small maxcount, this could lead function nfsd4_listxattr_validate_cookie to return NFS4ERR_BAD_COOKIE. If maxcount = 20, then second check on function gives RHS = 3 thus any cookie larger than 3 returns NFS4ERR_BAD_COOKIE. There is no need to validate the cookie on the return XDR buffer since attribute referenced by cookie will be the first in the return buffer. Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic") Signed-off-by: Jorge Mora Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c719c475a068ef..f0be0d6fe63fd2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5386,16 +5386,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, /* * If the cookie is larger than the maximum number we can fit - * in either the buffer we just got back from vfs_listxattr, or, - * XDR-encoded, in the return buffer, it's invalid. + * in the buffer we just got back from vfs_listxattr, it's invalid. */ if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) return nfserr_badcookie; - if (cookie > (listxattrs->lsxa_maxcount / - (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4))) - return nfserr_badcookie; - *offsetp = (u32)cookie; return 0; } From 42ec6645cb8f43a4062f8c62bbd13a43fa73351b Mon Sep 17 00:00:00 2001 From: Jorge Mora Date: Thu, 25 Jan 2024 07:46:12 -0700 Subject: [PATCH 268/707] NFSD: change LISTXATTRS cookie encoding to big-endian Function nfsd4_listxattr_validate_cookie() expects the cookie as an offset to the list thus it needs to be encoded in big-endian. Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic") Signed-off-by: Jorge Mora Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f0be0d6fe63fd2..5649076df4b4f7 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5407,6 +5407,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, u64 cookie; char *sp; __be32 status, tmp; + __be64 wire_cookie; __be32 *p; u32 nuser; @@ -5498,7 +5499,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, cookie = offset + count; - write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); + wire_cookie = cpu_to_be64(cookie); + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8); tmp = cpu_to_be32(count); write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); out: From 1d6bbeca66aaff9a1698f7fbee243cb379012364 Mon Sep 17 00:00:00 2001 From: Jorge Mora Date: Thu, 25 Jan 2024 07:45:28 -0700 Subject: [PATCH 269/707] NFSD: fix LISTXATTRS returning a short list with eof=TRUE If the XDR buffer is not large enough to fit all attributes and the remaining bytes left in the XDR buffer (xdrleft) is equal to the number of bytes for the current attribute, then the loop will prematurely exit without setting eof to FALSE. Also in this case, adding the eof flag to the buffer will make the reply 4 bytes larger than lsxa_maxcount. Need to check if there are enough bytes to fit not only the next attribute name but also the eof as well. Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic") Signed-off-by: Jorge Mora Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5649076df4b4f7..840ecd7eaf0713 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5447,7 +5447,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, slen -= XATTR_USER_PREFIX_LEN; xdrlen = 4 + ((slen + 3) & ~3); - if (xdrlen > xdrleft) { + /* Check if both entry and eof can fit in the XDR buffer */ + if (xdrlen + XDR_UNIT > xdrleft) { if (count == 0) { /* * Can't even fit the first attribute name. From 3713a52d5a7df24ab62ba6d9a0a72b1dae473a6e Mon Sep 17 00:00:00 2001 From: Jorge Mora Date: Thu, 25 Jan 2024 07:42:23 -0700 Subject: [PATCH 270/707] NFSD: fix LISTXATTRS returning more bytes than maxcount The maxcount is the maximum number of bytes for the LISTXATTRS4resok result. This includes the cookie and the count for the name array, thus subtract 12 bytes from the maxcount: 8 (cookie) + 4 (array count) when filling up the name array. Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic") Signed-off-by: Jorge Mora Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 840ecd7eaf0713..e3f761cd5ee78d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5423,7 +5423,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, */ cookie_offset = xdr->buf->len; count_offset = cookie_offset + 8; - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) { status = nfserr_resource; goto out; @@ -5434,7 +5434,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, sp = listxattrs->lsxa_buf; nuser = 0; - xdrleft = listxattrs->lsxa_maxcount; + /* Bytes left is maxcount - 8 (cookie) - 4 (array count) */ + xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3; while (left > 0 && xdrleft > 0) { slen = strlen(sp); From c1365ef33ad700b0a1fcdc65110e6c94d274f348 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:40 -0500 Subject: [PATCH 271/707] sunrpc: don't change ->sv_stats if it doesn't exist We check for the existence of ->sv_stats elsewhere except in the core processing code. It appears that only nfsd actual exports these values anywhere, everybody else just has a write only copy of sv_stats in their svc_program. Add a check for ->sv_stats before every adjustment to allow us to eliminate the stats struct from all the users who don't report the stats. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f60c93e5a25d69..d2e6f3d5921801 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1375,7 +1375,8 @@ svc_process_common(struct svc_rqst *rqstp) goto err_bad_proc; /* Syntactic check complete */ - serv->sv_stats->rpccnt++; + if (serv->sv_stats) + serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); @@ -1427,7 +1428,8 @@ svc_process_common(struct svc_rqst *rqstp) goto close_xprt; err_bad_rpc: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ @@ -1438,7 +1440,8 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); - serv->sv_stats->rpcbadauth++; + if (serv->sv_stats) + serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); @@ -1448,7 +1451,8 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; @@ -1456,7 +1460,8 @@ svc_process_common(struct svc_rqst *rqstp) svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* @@ -1470,19 +1475,22 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; err_garbage_args: svc_printk(rqstp, "failed to decode RPC header\n"); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_garbage_args; goto sendit; err_system_err: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_system_err; goto sendit; } @@ -1534,7 +1542,8 @@ void svc_process(struct svc_rqst *rqstp) out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); - rqstp->rq_server->sv_stats->rpcbadfmt++; + if (rqstp->rq_server->sv_stats) + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } From 93af2fc6e6589e225e27e96e96604fb1352c8366 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:41 -0500 Subject: [PATCH 272/707] nfsd: stop setting ->pg_stats for unused stats A lot of places are setting a blank svc_stats in ->pg_stats and never utilizing these stats. Remove all of these extra structs as we're not reporting these stats anywhere. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 3 --- fs/nfs/callback.c | 3 --- fs/nfsd/nfssvc.c | 5 ----- 3 files changed, 11 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index ce5862482097a1..ab8042a5b895bc 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = { #endif }; -static struct svc_stat nlmsvc_stats; - #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ @@ -719,7 +717,6 @@ static struct svc_program nlmsvc_program = { .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ - .pg_stats = &nlmsvc_stats, /* stats table */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 760d27dd7225e9..8adfcd4c8c1a0a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = nfs_callback_authenticate, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9a894c3511baf3..a0b117107e8605 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -80,7 +80,6 @@ unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, @@ -99,15 +98,11 @@ static struct svc_program nfsd_acl_program = { .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, .pg_authenticate = &svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }; -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { From 6b32f5f1e4c2e77d3e8a811cc40a2b5d8d11dd2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:42 -0500 Subject: [PATCH 273/707] sunrpc: pass in the sv_stats struct through svc_create_pooled Since only one service actually reports the rpc stats there's not much of a reason to have a pointer to it in the svc_program struct. Adjust the svc_create_pooled function to take the sv_stats as an argument and pass the struct through there as desired instead of getting it from the svc_program->pg_stats. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 3 ++- include/linux/sunrpc/svc.h | 4 +++- net/sunrpc/svc.c | 12 +++++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a0b117107e8605..d640f893021a71 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,7 +661,8 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd); + serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats, + nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 67cf1c9efd809b..91a653eb3a5073 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -411,7 +411,9 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, +struct svc_serv * svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, + unsigned int bufsize, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_info *si, struct file *file); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d2e6f3d5921801..9bd8a868c1a70a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -451,8 +451,8 @@ __svc_init_bc(struct svc_serv *serv) * Create an RPC service */ static struct svc_serv * -__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - int (*threadfn)(void *data)) +__svc_create(struct svc_program *prog, struct svc_stat *stats, + unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -463,7 +463,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - serv->sv_stats = prog->pg_stats; + serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; @@ -529,26 +529,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, 1, threadfn); + return __svc_create(prog, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: the RPC program the new service will handle + * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, threadfn); + serv = __svc_create(prog, stats, bufsize, npools, threadfn); if (!serv) goto out_err; return serv; From 2896caafbc9efc3188ee0e38440efd29130fdd94 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:43 -0500 Subject: [PATCH 274/707] sunrpc: remove ->pg_stats from svc_program Now that this isn't used anywhere, remove it. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 1 - include/linux/sunrpc/svc.h | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d640f893021a71..d98a6abad99010 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -127,7 +127,6 @@ struct svc_program nfsd_program = { .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ - .pg_stats = &nfsd_svcstats, /* version table */ .pg_authenticate = &svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 91a653eb3a5073..23617da0e565e7 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -339,7 +339,6 @@ struct svc_program { const struct svc_version **pg_vers; /* version array */ char * pg_name; /* service name */ char * pg_class; /* class name: services sharing authentication */ - struct svc_stat * pg_stats; /* rpc statistics */ enum svc_auth_status (*pg_authenticate)(struct svc_rqst *rqstp); __be32 (*pg_init_request)(struct svc_rqst *, const struct svc_program *, From b3b72309149a69c7e16911fbbf77dde058f75491 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:44 -0500 Subject: [PATCH 275/707] sunrpc: use the struct net as the svc proc private nfsd is the only thing using this helper, and it doesn't use the private currently. When we switch to per-network namespace stats we will need the struct net * in order to get to the nfsd_net. Use the net as the proc private so we can utilize this when we make the switch over. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 65fc1297c6dfa4..383860cb1d5b0f 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, proc_ops); + return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); From a96efc8859539ce5da7e2a7cd0dd177d41b1810f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:45 -0500 Subject: [PATCH 276/707] nfsd: rename NFSD_NET_* to NFSD_STATS_* We're going to merge the stats all into per network namespace in subsequent patches, rename these nn counters to be consistent with the rest of the stats. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 ++-- fs/nfsd/nfscache.c | 4 ++-- fs/nfsd/stats.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 74b4360779a112..e3605cb5f044d8 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -26,9 +26,9 @@ struct nfsd4_client_tracking_ops; enum { /* cache misses due only to checksum comparison failures */ - NFSD_NET_PAYLOAD_MISSES, + NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ - NFSD_NET_DRC_MEM_USAGE, + NFSD_STATS_DRC_MEM_USAGE, NFSD_NET_COUNTERS_NUM }; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 5c1a4a0aa60568..3d4a9d181c43e2 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -687,7 +687,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); seq_printf(m, "mem usage: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", @@ -695,7 +695,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "not cached: %lld\n", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 14f50c660b619e..7ed4325ac69123 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -81,17 +81,17 @@ static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]); } static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) { - percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) { - percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } #ifdef CONFIG_NFSD_V4 From bdf92cbe7097b5cccf4e3e9ae6619c0f315346f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:46 -0500 Subject: [PATCH 277/707] nfsd: expose /proc/net/sunrpc/nfsd in net namespaces We are running nfsd servers inside of containers with their own network namespace, and we want to monitor these services using the stats found in /proc. However these are not exposed in the proc inside of the container, so we have to bind mount the host /proc into our containers to get at this information. Separate out the stat counters init and the proc registration, and move the proc registration into the pernet operations entry and exit points so that these stats can be exposed inside of network namespaces. This is an intermediate step, this just exposes the global counters in the network namespace. Subsequent patches will move these counters into the per-network namespace container. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 8 +++++--- fs/nfsd/stats.c | 21 ++++++--------------- fs/nfsd/stats.h | 6 ++++-- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f206ca32e7f53c..b57480b50e350c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1679,6 +1679,7 @@ static __net_init int nfsd_net_init(struct net *net) nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); + nfsd_proc_stat_init(net); return 0; @@ -1699,6 +1700,7 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_proc_stat_shutdown(net); nfsd_net_reply_cache_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); @@ -1722,7 +1724,7 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_init(); /* Statistics */ + retval = nfsd_stat_counters_init(); /* Statistics */ if (retval) goto out_free_pnfs; retval = nfsd_drc_slab_create(); @@ -1762,7 +1764,7 @@ static int __init init_nfsd(void) nfsd_lockd_shutdown(); nfsd_drc_slab_free(); out_free_stat: - nfsd_stat_shutdown(); + nfsd_stat_counters_destroy(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1780,7 +1782,7 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); + nfsd_stat_counters_destroy(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 12d79f5d4eb1ac..394a65a33942d7 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -108,31 +108,22 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -static int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(void) { return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); } -static void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(void) { nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); } -int nfsd_stat_init(void) +void nfsd_proc_stat_init(struct net *net) { - int err; - - err = nfsd_stat_counters_init(); - if (err) - return err; - - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); - - return 0; + svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops); } -void nfsd_stat_shutdown(void) +void nfsd_proc_stat_shutdown(struct net *net) { - nfsd_stat_counters_destroy(); - svc_proc_unregister(&init_net, "nfsd"); + svc_proc_unregister(net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 7ed4325ac69123..38811aa7d13e1e 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -40,8 +40,10 @@ extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_stat_counters_init(void); +void nfsd_stat_counters_destroy(void); +void nfsd_proc_stat_init(struct net *net); +void nfsd_proc_stat_shutdown(struct net *net); static inline void nfsd_stats_rc_hits_inc(void) { From fce33795b2c3284637713515f4aaf6970942aa8c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:47 -0500 Subject: [PATCH 278/707] nfsd: make all of the nfsd stats per-network namespace We have a global set of counters that we modify for all of the nfsd operations, but now that we're exposing these stats across all network namespaces we need to make the stats also be per-network namespace. We already have some caching stats that are per-network namespace, so move these definitions into the same counter and then adjust all the helpers and users of these stats to provide the appropriate nfsd_net struct so that the stats are maintained for the per-network namespace objects. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 2 -- fs/nfsd/netns.h | 17 ++++++++++++-- fs/nfsd/nfs4proc.c | 6 ++--- fs/nfsd/nfs4state.c | 3 ++- fs/nfsd/nfscache.c | 36 ++++++------------------------ fs/nfsd/nfsctl.c | 12 +++------- fs/nfsd/nfsfh.c | 3 ++- fs/nfsd/stats.c | 26 ++++++++++++---------- fs/nfsd/stats.h | 54 ++++++++++++++++----------------------------- fs/nfsd/vfs.c | 6 +++-- 10 files changed, 69 insertions(+), 96 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4cbe0434cbb8ce..66a05fefae98ea 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,8 +80,6 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); -int nfsd_net_reply_cache_init(struct nfsd_net *nn); -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index e3605cb5f044d8..0cef4bb407a9c6 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,19 @@ enum { NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ NFSD_STATS_DRC_MEM_USAGE, - NFSD_NET_COUNTERS_NUM + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ +#endif + NFSD_STATS_COUNTERS_NUM }; /* @@ -164,7 +177,7 @@ struct nfsd_net { atomic_t num_drc_entries; /* Per-netns stats counters */ - struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 14712fa08f769e..648ff427005e6c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2490,10 +2490,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp) return rpc_success; } -static inline void nfsd4_increment_op_stats(u32 opnum) +static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); + percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -2768,7 +2768,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); - nfsd4_increment_op_stats(op->opnum); + nfsd4_increment_op_stats(nn, op->opnum); } fh_put(current_fh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d7d561b29fb0d9..82f456bc42a4cf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8461,6 +8461,7 @@ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) { __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lock *fl; struct nfs4_delegation *dp; @@ -8490,7 +8491,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) } break_lease: spin_unlock(&ctx->flc_lock); - nfsd_stats_wdeleg_getattr_inc(); + nfsd_stats_wdeleg_getattr_inc(nn); status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 3d4a9d181c43e2..cfcc6ac8f255a8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -176,27 +176,6 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -/** - * nfsd_net_reply_cache_init - per net namespace reply cache set-up - * @nn: nfsd_net being initialized - * - * Returns zero on succes; otherwise a negative errno is returned. - */ -int nfsd_net_reply_cache_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); -} - -/** - * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down - * @nn: nfsd_net being freed - * - */ -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); -} - int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -501,7 +480,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { - struct nfsd_net *nn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; @@ -510,7 +489,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, int rtn = RC_DOIT; if (type == RC_NOCACHE) { - nfsd_stats_rc_nocache_inc(); + nfsd_stats_rc_nocache_inc(nn); goto out; } @@ -520,7 +499,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -537,7 +515,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_cacherep_dispose(&dispose); - nfsd_stats_rc_misses_inc(); + nfsd_stats_rc_misses_inc(nn); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); goto out; @@ -545,7 +523,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, found_entry: /* We found a matching entry which is either in progress or done. */ nfsd_reply_cache_free_locked(NULL, rp, nn); - nfsd_stats_rc_hits_inc(); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; rp = found; @@ -689,11 +667,11 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mem usage: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); seq_printf(m, "not cached: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b57480b50e350c..ea3c8114245c28 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1671,7 +1671,7 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_net_reply_cache_init(nn); + retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; nn->nfsd_versions = NULL; @@ -1701,7 +1701,7 @@ static __net_exit void nfsd_net_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); - nfsd_net_reply_cache_destroy(nn); + nfsd_stat_counters_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); @@ -1724,12 +1724,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_counters_init(); /* Statistics */ - if (retval) - goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) @@ -1763,8 +1760,6 @@ static int __init init_nfsd(void) out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); -out_free_stat: - nfsd_stat_counters_destroy(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1782,7 +1777,6 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_counters_destroy(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index dbfa0ac13564ac..40fecf7b224f2f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -327,6 +327,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -395,7 +396,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) out: trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(exp); + nfsd_stats_fh_stale_inc(nn, exp); return error; } diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 394a65a33942d7..44e275324b06e5 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -34,15 +34,17 @@ struct svc_stat nfsd_svcstats = { static int nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); @@ -63,10 +65,10 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -108,14 +110,14 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(struct nfsd_net *nn) { - return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } -void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(struct nfsd_net *nn) { - nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } void nfsd_proc_stat_init(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 38811aa7d13e1e..c24be4ddbe7d70 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,26 +10,7 @@ #include #include - -enum { - NFSD_STATS_RC_HITS, /* repcache hits */ - NFSD_STATS_RC_MISSES, /* repcache misses */ - NFSD_STATS_RC_NOCACHE, /* uncached reqs */ - NFSD_STATS_FH_STALE, /* FH stale error */ - NFSD_STATS_IO_READ, /* bytes returned to read requests */ - NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ -#ifdef CONFIG_NFSD_V4 - NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ - NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, -#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) - NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ -#endif - NFSD_STATS_COUNTERS_NUM -}; - struct nfsd_stats { - struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - atomic_t th_cnt; /* number of available threads */ }; @@ -40,43 +21,46 @@ extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_counters_init(void); -void nfsd_stat_counters_destroy(void); +int nfsd_stat_counters_init(struct nfsd_net *nn); +void nfsd_stat_counters_destroy(struct nfsd_net *nn); void nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); -static inline void nfsd_stats_rc_hits_inc(void) +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); } -static inline void nfsd_stats_rc_misses_inc(void) +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); } -static inline void nfsd_stats_rc_nocache_inc(void) +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); if (exp && exp->ex_stats) percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } -static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_write_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } @@ -97,9 +81,9 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) } #ifdef CONFIG_NFSD_V4 -static inline void nfsd_stats_wdeleg_getattr_inc(void) +static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); + percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]); } #endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f57749cd6f0b1a..38952105ed7fd4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1002,7 +1002,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsd_stats_io_read_add(fhp->fh_export, host_err); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + nfsd_stats_io_read_add(nn, fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1185,7 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsd_stats_io_write_add(exp, *cnt); + nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = filemap_check_wb_err(file->f_mapping, since); if (host_err < 0) From 39ed42f221ad514767163d3c5fb5701fb5be47ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:48 -0500 Subject: [PATCH 279/707] nfsd: remove nfsd_stats, make th_cnt a global counter This is the last global stat, take it out of the nfsd_stats struct and make it a global part of nfsd, report it the same as always. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 1 + fs/nfsd/nfssvc.c | 5 +++-- fs/nfsd/stats.c | 3 +-- fs/nfsd/stats.h | 6 ------ 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 304e9728b929a0..be2ea3d6d2a289 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,6 +86,7 @@ extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d98a6abad99010..fdb59189643044 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) @@ -924,7 +925,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; - atomic_inc(&nfsdstats.th_cnt); + atomic_inc(&nfsd_th_cnt); set_freezable(); @@ -940,7 +941,7 @@ nfsd(void *vrqstp) nfsd_file_net_dispose(nn); } - atomic_dec(&nfsdstats.th_cnt); + atomic_dec(&nfsd_th_cnt); out: /* Release the thread */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 44e275324b06e5..3a7f791c30528d 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,7 +27,6 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, }; @@ -47,7 +46,7 @@ static int nfsd_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index c24be4ddbe7d70..5675d283a53730 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,12 +10,6 @@ #include #include -struct nfsd_stats { - atomic_t th_cnt; /* number of available threads */ -}; - -extern struct nfsd_stats nfsdstats; - extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); From 52ebfd3e83cda597a4ea431a682e4424d4cadea1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:49 -0500 Subject: [PATCH 280/707] nfsd: make svc_stat per-network namespace instead of global The final bit of stats that is global is the rpc svc_stat. Move this into the nfsd_net struct and use that everywhere instead of the global struct. Remove the unused global struct. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfssvc.c | 2 +- fs/nfsd/stats.c | 10 ++++------ fs/nfsd/stats.h | 2 -- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 0cef4bb407a9c6..afc16ee4da7428 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -14,6 +14,7 @@ #include #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -179,6 +180,9 @@ struct nfsd_net { /* Per-netns stats counters */ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; + /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ea3c8114245c28..5a5547bd6ecf7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1674,6 +1674,8 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fdb59189643044..c0d17b92b249f7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,7 +661,7 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats, + serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 3a7f791c30528d..be52fb1e928ed6 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,10 +27,6 @@ #include "nfsd.h" -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - static int nfsd_show(struct seq_file *seq, void *v) { struct net *net = pde_data(file_inode(seq->file)); @@ -56,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ @@ -121,7 +117,9 @@ void nfsd_stat_counters_destroy(struct nfsd_net *nn) void nfsd_proc_stat_init(struct net *net) { - svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 5675d283a53730..d2753e975dfd34 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,8 +10,6 @@ #include #include -extern struct svc_stat nfsd_svcstats; - int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); From 3b818452f12d6d2e25c8e4f40cc88d6761e838ea Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:17 -0500 Subject: [PATCH 281/707] NFSD: Reset cb_seq_status after NFS4ERR_DELAY I noticed that once an NFSv4.1 callback operation gets a NFS4ERR_DELAY status on CB_SEQUENCE and then the connection is lost, the callback client loops, resending it indefinitely. The switch arm in nfsd4_cb_sequence_done() that handles NFS4ERR_DELAY uses rpc_restart_call() to rearm the RPC state machine for the retransmit, but that path does not call the rpc_prepare_call callback again. Thus cb_seq_status is set to -10008 by the first NFS4ERR_DELAY result, but is never set back to 1 for the retransmits. nfsd4_cb_sequence_done() thinks it's getting nothing but a long series of CB_SEQUENCE NFS4ERR_DELAY replies. Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors") Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 926c29879c6ab8..43b0a34a5d5b8a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1178,6 +1178,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback ret = false; break; case -NFS4ERR_DELAY: + cb->cb_seq_status = 1; if (!rpc_restart_call(task)) goto out; From 1d3befc1d9d5c705955315037055e4f9ea2003b7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:23 -0500 Subject: [PATCH 282/707] NFSD: Convert the callback workqueue to use delayed_work Normally, NFSv4 callback operations are supposed to be sent to the client as soon as they are queued up. In a moment, I will introduce a recovery path where the server has to wait for the client to reconnect. We don't want a hard busy wait here -- the callback should be requeued to try again in several milliseconds. For now, convert nfsd4_callback from struct work_struct to struct delayed_work, and queue with a zero delay argument. This should avoid behavior changes for current operation. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 6 +++--- fs/nfsd/state.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 43b0a34a5d5b8a..1ed2512b364846 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -887,7 +887,7 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { - return queue_work(callback_wq, &cb->cb_work); + return queue_delayed_work(callback_wq, &cb->cb_work, 0); } static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) @@ -1370,7 +1370,7 @@ static void nfsd4_run_cb_work(struct work_struct *work) { struct nfsd4_callback *cb = - container_of(work, struct nfsd4_callback, cb_work); + container_of(work, struct nfsd4_callback, cb_work.work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; int flags; @@ -1415,7 +1415,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; - INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); + INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 41bdc913fa715b..87c4372ba36a8d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,7 +68,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; - struct work_struct cb_work; + struct delayed_work cb_work; int cb_seq_status; int cb_status; bool cb_need_restart; From 4bfc8cc603d73d757dafed5eb270aca6d189543e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:29 -0500 Subject: [PATCH 283/707] NFSD: Reschedule CB operations when backchannel rpc_clnt is shut down As part of managing a client disconnect, NFSD closes down and replaces the backchannel rpc_clnt. If a callback operation is pending when the backchannel rpc_clnt is shut down, currently nfsd4_run_cb_work() just discards that callback. But there are multiple cases to deal with here: o The client's lease is getting destroyed. Throw the CB away. o The client disconnected. It might be forcing a retransmit of CB operations, or it could have disconnected for other reasons. Reschedule the CB so it is retransmitted when the client reconnects. Since callback operations can now be rescheduled, ensure that cb_ops->prepare can be called only once by moving the cb_ops->prepare paragraph down to just before the rpc_call_async() call. Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()") Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 1ed2512b364846..389d05985c5230 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -890,6 +890,13 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb) return queue_delayed_work(callback_wq, &cb->cb_work, 0); } +static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb, + unsigned long msecs) +{ + queue_delayed_work(callback_wq, &cb->cb_work, + msecs_to_jiffies(msecs)); +} + static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) { atomic_inc(&clp->cl_cb_inflight); @@ -1375,20 +1382,21 @@ nfsd4_run_cb_work(struct work_struct *work) struct rpc_clnt *clnt; int flags; - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); - } - if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; if (!clnt) { - /* Callback channel broken, or client killed; give up: */ - nfsd41_destroy_cb(cb); + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) + nfsd41_destroy_cb(cb); + else { + /* + * XXX: Ideally, we could wait for the client to + * reconnect, but I haven't figured out how + * to do that yet. + */ + nfsd4_queue_cb_delayed(cb, 25); + } return; } @@ -1401,6 +1409,12 @@ nfsd4_run_cb_work(struct work_struct *work) return; } + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } cb->cb_msg.rpc_cred = clp->cl_cb_cred; flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, From fa8fbe5970231df591aca7d9075c7da4ec85b473 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:36 -0500 Subject: [PATCH 284/707] NFSD: Retransmit callbacks after client reconnects NFSv4.1 clients assume that if they disconnect, that will force the server to resend pending callback operations once a fresh connection has been established. Turns out NFSD has not been resending after reconnect. Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors") Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 389d05985c5230..3bff14241b3cc5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1178,12 +1178,21 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - fallthrough; + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + ret = false; + break; case 1: + /* + * cb_seq_status remains 1 if an RPC Reply was never + * received. NFSD can't know if the client processed + * the CB_SEQUENCE operation. Ask the client to send a + * DESTROY_SESSION to recover. + */ + fallthrough; case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); ret = false; - break; + goto need_restart; case -NFS4ERR_DELAY: cb->cb_seq_status = 1; if (!rpc_restart_call(task)) From 5c2be2ed0f80683c8773b379b0af2a97f194a1a2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:42 -0500 Subject: [PATCH 285/707] NFSD: Add nfsd_seq4_status trace event Add a trace point that records SEQ4_STATUS flags returned in an NFSv4.1 SEQUENCE response. SEQ4_STATUS flags report backchannel issues and changes to lease state to clients. Knowing what the server is reporting to clients is useful for debugging both configuration and operational issues in real time. For example, upcoming patches will enable server administrators to revoke parts of a client's lease; that revocation is indicated to the client when a subsequent SEQUENCE operation has one or more SEQ4_STATUS flags that are set. Sample trace records: nfsd-927 [006] 615.581821: nfsd_seq4_status: xid=0x095ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT nfsd-927 [006] 615.588043: nfsd_seq4_status: xid=0x0a5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT nfsd-928 [003] 615.588448: nfsd_seq4_status: xid=0x0b5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 82f456bc42a4cf..ae9b5a3a585f96 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4058,6 +4058,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1e8cf079b0f4b..38d11b43779c77 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -696,6 +696,41 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ DEFINE_STID_EVENT(revoke); +TRACE_EVENT_CONDITION(nfsd_seq4_status, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct nfsd4_sequence *sequence + ), + TP_ARGS(rqstp, sequence), + TP_CONDITION(sequence->status_flags), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(unsigned long, status_flags) + ), + TP_fast_assign( + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&sequence->sessionid; + + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->status_flags = sequence->status_flags; + ), + TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s", + __entry->xid, __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + show_nfs4_seq4_status(__entry->status_flags) + ) +); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), From 2a2547f8a32216f0e1079b61b85790e8567a4565 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:48 -0500 Subject: [PATCH 286/707] NFSD: Replace dprintks in nfsd4_cb_sequence_done() Improve observability of backchannel session operation. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 9 +++-- fs/nfsd/trace.h | 82 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3bff14241b3cc5..78d9939cf4b093 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1165,6 +1165,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback if (!cb->cb_holds_slot) goto need_restart; + /* This is the operation status code for CB_SEQUENCE */ + trace_nfsd_cb_seq_status(task, cb); switch (cb->cb_seq_status) { case 0: /* @@ -1210,13 +1212,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; default: nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); - dprintk("%s: unprocessed error %d\n", __func__, - cb->cb_seq_status); } - nfsd41_cb_release_slot(cb); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + + trace_nfsd_cb_free_slot(task, cb); if (RPC_SIGNALLED(task)) goto need_restart; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 38d11b43779c77..c134c755ae5d1e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,8 +9,10 @@ #define _NFSD_TRACE_H #include +#include #include #include +#include #include "export.h" #include "nfsfh.h" @@ -1440,6 +1442,86 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +TRACE_EVENT(nfsd_cb_seq_status, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(int, tk_status) + __field(int, seq_status) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->tk_status = task->tk_status; + __entry->seq_status = cb->cb_seq_status; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->tk_status, __entry->seq_status + ) +); + +TRACE_EVENT(nfsd_cb_free_slot, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(u32, slot_seqno) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->slot_seqno = session->se_cb_seq_nr; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->slot_seqno + ) +); + TRACE_EVENT_CONDITION(nfsd_cb_recall, TP_PROTO( const struct nfs4_stid *stid From d5927e2267c8f247f77e027be01b82407fbac04f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:45:54 -0500 Subject: [PATCH 287/707] NFSD: Rename nfsd_cb_state trace point Make it clear where backchannel state is updated. Example trace point output: kworker/u16:0-10 [006] 2800.080404: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP nfsd-940 [003] 2800.478368: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [003] 2800.478828: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN kworker/u16:0-10 [005] 2802.039724: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP kworker/u16:0-10 [005] 2810.611452: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=FAULT kworker/u16:0-10 [005] 2810.616832: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [005] 2810.616931: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +++- fs/nfsd/trace.h | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 78d9939cf4b093..a63171ccfc2b88 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1006,7 +1006,7 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) { if (clp->cl_cb_state != newstate) { clp->cl_cb_state = newstate; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_new_state(clp); } } @@ -1390,6 +1390,8 @@ nfsd4_run_cb_work(struct work_struct *work) struct rpc_clnt *clnt; int flags; + trace_nfsd_cb_start(clp); + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c134c755ae5d1e..6003af2bee33cb 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1371,7 +1371,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(start); +DEFINE_NFSD_CB_EVENT(new_state); DEFINE_NFSD_CB_EVENT(probe); DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); From 035544bdf2ac5f0ab809e3b87cebeae5ae47035d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:01 -0500 Subject: [PATCH 288/707] NFSD: Add callback operation lifetime trace points Help observe the flow of callback operations. bc_shutdown() records exactly when the backchannel RPC client is destroyed and cl_cb_client is replaced with NULL. Examples include: nfsd-955 [004] 650.013997: nfsd_cb_queue: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u21:4-497 [004] 650.014050: nfsd_cb_seq_status: task:00000001@00000001 sessionid=65b3c5b8:f541f749:00000001:00000000 tk_status=-107 seq_status=1 kworker/u21:4-497 [004] 650.014051: nfsd_cb_restart: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (first try) kworker/u21:4-497 [004] 650.014066: nfsd_cb_queue: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (need restart) kworker/u16:0-10 [006] 650.065750: nfsd_cb_start: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN kworker/u16:0-10 [006] 650.065752: nfsd_cb_bc_update: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u16:0-10 [006] 650.065754: nfsd_cb_bc_shutdown: addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try) kworker/u16:0-10 [006] 650.065810: nfsd_cb_new_state: addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 8 ++++++++ fs/nfsd/trace.h | 42 ++++++++++++++++++++++++++++++++++++++++ include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a63171ccfc2b88..b50ce54aa1bfab 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -887,12 +887,14 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { + trace_nfsd_cb_queue(cb->cb_clp, cb); return queue_delayed_work(callback_wq, &cb->cb_work, 0); } static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb, unsigned long msecs) { + trace_nfsd_cb_queue(cb->cb_clp, cb); queue_delayed_work(callback_wq, &cb->cb_work, msecs_to_jiffies(msecs)); } @@ -1113,6 +1115,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); @@ -1227,6 +1230,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback goto out; need_restart: if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); task->tk_status = 0; cb->cb_need_restart = true; } @@ -1340,11 +1344,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1356,6 +1363,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 6003af2bee33cb..9f9e58debc2611 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1443,6 +1443,48 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(bool, need_restart) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->need_restart = cb->cb_need_restart; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cb, __entry->need_restart ? + " (need restart)" : " (first try)" + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + TRACE_EVENT(nfsd_cb_seq_status, TP_PROTO( const struct rpc_task *task, diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 0d9d48dca38a89..64ab5dac59ce0c 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -385,3 +385,37 @@ TRACE_DEFINE_ENUM(IOMODE_ANY); { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, "RESTART_RECLAIM_NEEDED" }, \ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, "CB_PATH_DOWN_SESSION" }, \ { SEQ4_STATUS_BACKCHANNEL_FAULT, "BACKCHANNEL_FAULT" }) + +TRACE_DEFINE_ENUM(OP_CB_GETATTR); +TRACE_DEFINE_ENUM(OP_CB_RECALL); +TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY); +TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG); +TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY); +TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL); +TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT); +TRACE_DEFINE_ENUM(OP_CB_SEQUENCE); +TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID); +TRACE_DEFINE_ENUM(OP_CB_OFFLOAD); +TRACE_DEFINE_ENUM(OP_CB_ILLEGAL); + +#define show_nfs4_cb_op(x) \ + __print_symbolic(x, \ + { 0, "CB_NULL" }, \ + { 1, "CB_COMPOUND" }, \ + { OP_CB_GETATTR, "CB_GETATTR" }, \ + { OP_CB_RECALL, "CB_RECALL" }, \ + { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \ + { OP_CB_NOTIFY, "CB_NOTIFY" }, \ + { OP_CB_PUSH_DELEG, "CB_PUSH_DELEG" }, \ + { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \ + { OP_CB_RECALLABLE_OBJ_AVAIL, "CB_RECALLABLE_OBJ_AVAIL" }, \ + { OP_CB_RECALL_SLOT, "CB_RECALL_SLOT" }, \ + { OP_CB_SEQUENCE, "CB_SEQUENCE" }, \ + { OP_CB_WANTS_CANCELLED, "CB_WANTS_CANCELLED" }, \ + { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \ + { OP_CB_NOTIFY_DEVICEID, "CB_NOTIFY_DEVICEID" }, \ + { OP_CB_OFFLOAD, "CB_OFFLOAD" }, \ + { OP_CB_ILLEGAL, "CB_ILLEGAL" }) From 33ad6a2f38e313cea2bf7bc56ea60215cddfacf8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:07 -0500 Subject: [PATCH 289/707] SUNRPC: Remove EXPORT_SYMBOL_GPL for svc_process_bc() svc_process_bc(), previously known as bc_svc_process(), was added in commit 4d6bbb6233c9 ("nfs41: Backchannel bc_svc_process()") but there has never been a call site outside of the sunrpc.ko module. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9bd8a868c1a70a..121d0739031b5a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1623,7 +1623,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } -EXPORT_SYMBOL_GPL(svc_process_bc); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** From 2ae8a993e913b7e071ec503c91f9304a38af16d9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:13 -0500 Subject: [PATCH 290/707] NFSD: Remove unused @reason argument Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index b50ce54aa1bfab..45a31f05159598 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -45,7 +45,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); +static void nfsd4_mark_cb_fault(struct nfs4_client *clp); #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -1012,14 +1012,14 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) } } -static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_down(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } -static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_fault(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; @@ -1031,7 +1031,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); else nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } @@ -1183,7 +1183,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; break; case 1: @@ -1195,7 +1195,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback */ fallthrough; case -NFS4ERR_BADSESSION: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; goto need_restart; case -NFS4ERR_DELAY: @@ -1214,7 +1214,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); } nfsd41_cb_release_slot(cb); @@ -1260,7 +1260,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) case -EIO: case -ETIMEDOUT: case -EACCES: - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); } break; default: @@ -1382,7 +1382,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - nfsd4_mark_cb_down(clp, err); + nfsd4_mark_cb_down(clp); if (c) svc_xprt_put(c->cn_xprt); return; From 0275e5a9e5c217115c96dc026e90c5913483f24c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:20 -0500 Subject: [PATCH 291/707] NFSD: Replace comment with lockdep assertion Convert a code comment into a real assertion. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 45a31f05159598..d73c66fa131df7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1315,12 +1315,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) nfsd41_cb_inflight_wait_complete(clp); } -/* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { struct nfsd4_session *s; struct nfsd4_conn *c; + lockdep_assert_held(&clp->cl_lock); + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_flags & NFS4_CDFC4_BACK) From d7b0002517bbb67b5e0835c25fa3142d6224c621 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:26 -0500 Subject: [PATCH 292/707] NFSD: Remove BUG_ON in nfsd4_process_cb_update() Don't kill the kworker thread, and don't panic while cl_lock is held. There's no need for scorching the earth here. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d73c66fa131df7..fd6a27e20f65ba 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1370,8 +1370,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { From e36b4d5f581085f88d2ddf6e143b211972e44bbd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:32 -0500 Subject: [PATCH 293/707] SUNRPC: Remove stale comments bc_close() and bc_destroy now do something, so the comments are no longer correct. Commit 6221f1d9b63f ("SUNRPC: Fix backchannel RPC soft lockups") should have removed these. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- net/sunrpc/xprtsock.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 58f3dc8d0d71c3..d92c13e78a56cf 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2987,20 +2987,11 @@ static int bc_send_request(struct rpc_rqst *req) return len; } -/* - * The close routine. Since this is client initiated, we do nothing - */ - static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } -/* - * The xprt destroy routine. Again, because this connection is client - * initiated, we do nothing - */ - static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); From a670b682f0500a921682917422c947e2b4275df5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Jan 2024 12:46:38 -0500 Subject: [PATCH 294/707] NFSD: Remove redundant cb_seq_status initialization As far as I can see, setting cb_seq_status in nfsd4_init_cb() is superfluous because it is set again in nfsd4_cb_prepare(). Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index fd6a27e20f65ba..32dd2fbb1f301b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1450,7 +1450,6 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work); - cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; cb->cb_holds_slot = false; From c345694e92d3cf5314d331b68edbcb9d743fdbd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 25 Jan 2024 16:32:29 +0100 Subject: [PATCH 295/707] selftests/landlock: Fix capability for net_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CAP_NET_ADMIN allows to configure network interfaces, not CAP_SYS_ADMIN which only allows to call unshare(2). Without this change, running network tests as a non-root user but with all capabilities would fail at the setup_loopback() step with "RTNETLINK answers: Operation not permitted". The issue is only visible when running tests with non-root users (i.e. only relying on ambient capabilities). Indeed, when configuring the network interface, the "ip" command is called, which may lead to the special handling of capabilities for the root user by execve(2). If root is the caller, then the inherited, permitted and effective capabilities are all reset, which then includes CAP_NET_ADMIN. However, if a non-root user is the caller, then ambient capabilities are masked by the inherited ones, which were explicitly dropped. To make execution deterministic whatever users are running the tests, set the noroot secure bit for each test, and set the inheritable and ambient capabilities to CAP_NET_ADMIN, the only capability that may be required after an execve(2). Factor out _effective_cap() into _change_cap(), and use it to manage ambient capabilities with the new set_ambient_cap() and clear_ambient_cap() helpers. This makes it possible to run all Landlock tests (including net_test) with uml-run.sh from https://github.com/landlock-lsm/landlock-test-tools Cc: Konstantin Meskhidze Fixes: a549d055a22e ("selftests/landlock: Add network tests") Link: https://lore.kernel.org/r/20240125153230.3817165-2-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 51 +++++++++++++++++---- tools/testing/selftests/landlock/net_test.c | 5 +- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 5b79758cae6275..13597ebd3a64e3 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -115,12 +116,20 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) /* clang-format off */ CAP_DAC_OVERRIDE, CAP_MKNOD, + CAP_NET_ADMIN, + CAP_NET_BIND_SERVICE, CAP_SYS_ADMIN, CAP_SYS_CHROOT, - CAP_NET_BIND_SERVICE, /* clang-format on */ }; + /* + * As a safety guard, this should be called each time, but it will fail + * the second time when called by the same task (e.g. + * fs_test.c:layout0.unpriv), which is OK. + */ + cap_set_secbits(SECBIT_NOROOT); + cap_p = cap_get_proc(); EXPECT_NE(NULL, cap_p) { @@ -137,6 +146,8 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); } } + + /* Automatically resets ambient capabilities. */ EXPECT_NE(-1, cap_set_proc(cap_p)) { TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); @@ -145,6 +156,9 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) { TH_LOG("Failed to cap_free: %s", strerror(errno)); } + + /* Quickly checks that ambient capabilities are cleared. */ + EXPECT_NE(-1, cap_get_ambient(caps[0])); } /* We cannot put such helpers in a library because of kselftest_harness.h . */ @@ -158,8 +172,9 @@ static void __maybe_unused drop_caps(struct __test_metadata *const _metadata) _init_caps(_metadata, true); } -static void _effective_cap(struct __test_metadata *const _metadata, - const cap_value_t caps, const cap_flag_value_t value) +static void _change_cap(struct __test_metadata *const _metadata, + const cap_flag_t flag, const cap_value_t cap, + const cap_flag_value_t value) { cap_t cap_p; @@ -168,7 +183,7 @@ static void _effective_cap(struct __test_metadata *const _metadata, { TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); } - EXPECT_NE(-1, cap_set_flag(cap_p, CAP_EFFECTIVE, 1, &caps, value)) + EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value)) { TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); } @@ -183,15 +198,35 @@ static void _effective_cap(struct __test_metadata *const _metadata, } static void __maybe_unused set_cap(struct __test_metadata *const _metadata, - const cap_value_t caps) + const cap_value_t cap) { - _effective_cap(_metadata, caps, CAP_SET); + _change_cap(_metadata, CAP_EFFECTIVE, cap, CAP_SET); } static void __maybe_unused clear_cap(struct __test_metadata *const _metadata, - const cap_value_t caps) + const cap_value_t cap) +{ + _change_cap(_metadata, CAP_EFFECTIVE, cap, CAP_CLEAR); +} + +static void __maybe_unused +set_ambient_cap(struct __test_metadata *const _metadata, const cap_value_t cap) +{ + _change_cap(_metadata, CAP_INHERITABLE, cap, CAP_SET); + + EXPECT_NE(-1, cap_set_ambient(cap, CAP_SET)) + { + TH_LOG("Failed to set ambient capability %d: %s", cap, + strerror(errno)); + } +} + +static void __maybe_unused clear_ambient_cap( + struct __test_metadata *const _metadata, const cap_value_t cap) { - _effective_cap(_metadata, caps, CAP_CLEAR); + EXPECT_EQ(1, cap_get_ambient(cap)); + _change_cap(_metadata, CAP_INHERITABLE, cap, CAP_CLEAR); + EXPECT_EQ(0, cap_get_ambient(cap)); } /* Receives an FD from a UNIX socket. Returns the received FD, or -errno. */ diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index efcde123af1f24..936cfc879f1d2c 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -107,8 +107,11 @@ static void setup_loopback(struct __test_metadata *const _metadata) { set_cap(_metadata, CAP_SYS_ADMIN); ASSERT_EQ(0, unshare(CLONE_NEWNET)); - ASSERT_EQ(0, system("ip link set dev lo up")); clear_cap(_metadata, CAP_SYS_ADMIN); + + set_ambient_cap(_metadata, CAP_NET_ADMIN); + ASSERT_EQ(0, system("ip link set dev lo up")); + clear_ambient_cap(_metadata, CAP_NET_ADMIN); } static bool is_restricted(const struct protocol_variant *const prot, From 8d3a9e03af2f66277abdc38ab28969500585d620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 25 Jan 2024 16:32:30 +0100 Subject: [PATCH 296/707] selftests/landlock: Clean up error logs related to capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It doesn't help to call TH_LOG() for every cap_*() error. Let's only log errors returned by the kernel, not by libcap specificities. Link: https://lore.kernel.org/r/20240125153230.3817165-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 39 ++++++----------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 13597ebd3a64e3..36fca11958b25a 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -131,31 +131,19 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) cap_set_secbits(SECBIT_NOROOT); cap_p = cap_get_proc(); - EXPECT_NE(NULL, cap_p) - { - TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_clear(cap_p)) - { - TH_LOG("Failed to cap_clear: %s", strerror(errno)); - } + EXPECT_NE(NULL, cap_p); + EXPECT_NE(-1, cap_clear(cap_p)); if (!drop_all) { EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED, - ARRAY_SIZE(caps), caps, CAP_SET)) - { - TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); - } + ARRAY_SIZE(caps), caps, CAP_SET)); } /* Automatically resets ambient capabilities. */ EXPECT_NE(-1, cap_set_proc(cap_p)) { - TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_free(cap_p)) - { - TH_LOG("Failed to cap_free: %s", strerror(errno)); + TH_LOG("Failed to set capabilities: %s", strerror(errno)); } + EXPECT_NE(-1, cap_free(cap_p)); /* Quickly checks that ambient capabilities are cleared. */ EXPECT_NE(-1, cap_get_ambient(caps[0])); @@ -179,22 +167,13 @@ static void _change_cap(struct __test_metadata *const _metadata, cap_t cap_p; cap_p = cap_get_proc(); - EXPECT_NE(NULL, cap_p) - { - TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value)) - { - TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); - } + EXPECT_NE(NULL, cap_p); + EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value)); EXPECT_NE(-1, cap_set_proc(cap_p)) { - TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); - } - EXPECT_NE(-1, cap_free(cap_p)) - { - TH_LOG("Failed to cap_free: %s", strerror(errno)); + TH_LOG("Failed to set capability %d: %s", cap, strerror(errno)); } + EXPECT_NE(-1, cap_free(cap_p)); } static void __maybe_unused set_cap(struct __test_metadata *const _metadata, From dfb1a0fcab9f7d294434bd67d98c0f9fb72f2ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 18 Jan 2024 12:36:32 +0100 Subject: [PATCH 297/707] landlock: Add support for KUnit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the SECURITY_LANDLOCK_KUNIT_TEST option to enable KUnit tests for Landlock. The minimal required configuration is listed in the security/landlock/.kunitconfig file. Add an initial landlock_fs KUnit test suite with 7 test cases for filesystem helpers. These are related to the LANDLOCK_ACCESS_FS_REFER right. There is one KUnit test case per: * mutated state (e.g. test_scope_to_request_*) or, * shared state between tests (e.g. test_is_eaccess_*). Add macros to improve readability of tests (i.e. one per line). Test cases are collocated with the tested functions to help maintenance and improve documentation. This is why SECURITY_LANDLOCK_KUNIT_TEST cannot be set as module. This is a nice complement to Landlock's user space kselftests. We expect new Landlock features to come with KUnit tests as well. Thanks to UML support, we can run all KUnit tests for Landlock with: ./tools/testing/kunit/kunit.py run --kunitconfig security/landlock [00:00:00] ======================= landlock_fs ======================= [00:00:00] [PASSED] test_no_more_access [00:00:00] [PASSED] test_scope_to_request_with_exec_none [00:00:00] [PASSED] test_scope_to_request_with_exec_some [00:00:00] [PASSED] test_scope_to_request_without_access [00:00:00] [PASSED] test_is_eacces_with_none [00:00:00] [PASSED] test_is_eacces_with_refer [00:00:00] [PASSED] test_is_eacces_with_write [00:00:00] =================== [PASSED] landlock_fs =================== [00:00:00] ============================================================ [00:00:00] Testing complete. Ran 7 tests: passed: 7 Cc: Konstantin Meskhidze Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20240118113632.1948478-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/.kunitconfig | 4 + security/landlock/Kconfig | 15 ++ security/landlock/common.h | 2 + security/landlock/fs.c | 234 +++++++++++++++++++ tools/testing/kunit/configs/all_tests.config | 1 + 5 files changed, 256 insertions(+) create mode 100644 security/landlock/.kunitconfig diff --git a/security/landlock/.kunitconfig b/security/landlock/.kunitconfig new file mode 100644 index 00000000000000..03e11946660429 --- /dev/null +++ b/security/landlock/.kunitconfig @@ -0,0 +1,4 @@ +CONFIG_KUNIT=y +CONFIG_SECURITY=y +CONFIG_SECURITY_LANDLOCK=y +CONFIG_SECURITY_LANDLOCK_KUNIT_TEST=y diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig index c4bf0d5eff39f3..3f1493402052ec 100644 --- a/security/landlock/Kconfig +++ b/security/landlock/Kconfig @@ -20,3 +20,18 @@ config SECURITY_LANDLOCK If you are unsure how to answer this question, answer N. Otherwise, you should also prepend "landlock," to the content of CONFIG_LSM to enable Landlock at boot time. + +config SECURITY_LANDLOCK_KUNIT_TEST + bool "KUnit tests for Landlock" if !KUNIT_ALL_TESTS + depends on KUNIT=y + depends on SECURITY_LANDLOCK + default KUNIT_ALL_TESTS + help + Build KUnit tests for Landlock. + + See the KUnit documentation in Documentation/dev-tools/kunit + + Run all KUnit tests for Landlock with: + ./tools/testing/kunit/kunit.py run --kunitconfig security/landlock + + If you are unsure how to answer this question, answer N. diff --git a/security/landlock/common.h b/security/landlock/common.h index 5dc0fe15707d6b..0eb1d34c2eaefe 100644 --- a/security/landlock/common.h +++ b/security/landlock/common.h @@ -17,4 +17,6 @@ #define pr_fmt(fmt) LANDLOCK_NAME ": " fmt +#define BIT_INDEX(bit) HWEIGHT(bit - 1) + #endif /* _SECURITY_LANDLOCK_COMMON_H */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index fc520a06f9af10..73997e63734f93 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -7,6 +7,7 @@ * Copyright © 2021-2022 Microsoft Corporation */ +#include #include #include #include @@ -311,6 +312,119 @@ static bool no_more_access( return true; } +#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__)) +#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__)) + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_no_more_access(struct kunit *const test) +{ + const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0), + }; + const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0), + }; + const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + }; + const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1), + }; + const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) | + BIT_ULL(1), + }; + const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {}; + + /* Checks without restriction. */ + NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false); + NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false); + NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false); + + /* + * Checks that we can only refer a file if no more access could be + * inherited. + */ + NMA_TRUE(&x0, &x0, false, &rx0, NULL, false); + NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false); + NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false); + NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false); + + /* Checks allowed referring with different nested domains. */ + NMA_TRUE(&x0, &x1, false, &x0, NULL, false); + NMA_TRUE(&x1, &x0, false, &x0, NULL, false); + NMA_TRUE(&x0, &x01, false, &x0, NULL, false); + NMA_TRUE(&x0, &x01, false, &rx0, NULL, false); + NMA_TRUE(&x01, &x0, false, &x0, NULL, false); + NMA_TRUE(&x01, &x0, false, &rx0, NULL, false); + NMA_FALSE(&x01, &x01, false, &x0, NULL, false); + + /* Checks that file access rights are also enforced for a directory. */ + NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false); + + /* Checks that directory access rights don't impact file referring... */ + NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false); + /* ...but only directory referring. */ + NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false); + + /* Checks directory exchange. */ + NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true); + NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true); + NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true); + NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true); + NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true); + + /* Checks file exchange with directory access rights... */ + NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false); + NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false); + NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false); + NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false); + /* ...and with file access rights. */ + NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false); + NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false); + NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false); + NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false); + NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false); + + /* + * Allowing the following requests should not be a security risk + * because domain 0 denies execute access, and domain 1 is always + * nested with domain 0. However, adding an exception for this case + * would mean to check all nested domains to make sure none can get + * more privileges (e.g. processes only sandboxed by domain 0). + * Moreover, this behavior (i.e. composition of N domains) could then + * be inconsistent compared to domain 1's ruleset alone (e.g. it might + * be denied to link/rename with domain 1's ruleset, whereas it would + * be allowed if nested on top of domain 0). Another drawback would be + * to create a cover channel that could enable sandboxed processes to + * infer most of the filesystem restrictions from their domain. To + * make it simple, efficient, safe, and more consistent, this case is + * always denied. + */ + NMA_FALSE(&x1, &x1, false, &x0, NULL, false); + NMA_FALSE(&x1, &x1, false, &rx0, NULL, false); + NMA_FALSE(&x1, &x1, true, &x0, NULL, false); + NMA_FALSE(&x1, &x1, true, &rx0, NULL, false); + + /* Checks the same case of exclusive domains with a file... */ + NMA_TRUE(&x1, &x1, false, &x01, NULL, false); + NMA_FALSE(&x1, &x1, false, &x01, &x0, false); + NMA_FALSE(&x1, &x1, false, &x01, &x01, false); + NMA_FALSE(&x1, &x1, false, &x0, &x0, false); + /* ...and with a directory. */ + NMA_FALSE(&x1, &x1, false, &x0, &x0, true); + NMA_FALSE(&x1, &x1, true, &x0, &x0, false); + NMA_FALSE(&x1, &x1, true, &x0, &x0, true); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + +#undef NMA_TRUE +#undef NMA_FALSE + /* * Removes @layer_masks accesses that are not requested. * @@ -331,6 +445,57 @@ scope_to_request(const access_mask_t access_request, return !memchr_inv(layer_masks, 0, sizeof(*layer_masks)); } +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_scope_to_request_with_exec_none(struct kunit *const test) +{ + /* Allows everything. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; + + /* Checks and scopes with execute. */ + KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, + &layer_masks)); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +static void test_scope_to_request_with_exec_some(struct kunit *const test) +{ + /* Denies execute and write. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), + }; + + /* Checks and scopes with execute. */ + KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, + &layer_masks)); + KUNIT_EXPECT_EQ(test, BIT_ULL(0), + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +static void test_scope_to_request_without_access(struct kunit *const test) +{ + /* Denies execute and write. */ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), + }; + + /* Checks and scopes without access request. */ + KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks)); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); + KUNIT_EXPECT_EQ(test, 0, + layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + /* * Returns true if there is at least one access right different than * LANDLOCK_ACCESS_FS_REFER. @@ -354,6 +519,51 @@ is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS], return false; } +#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__)) +#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__)) + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +static void test_is_eacces_with_none(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +static void test_is_eacces_with_refer(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0), + }; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +static void test_is_eacces_with_write(struct kunit *const test) +{ + const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { + [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0), + }; + + IE_FALSE(&layer_masks, 0); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); + IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); + + IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); +} + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ + +#undef IE_TRUE +#undef IE_FALSE + /** * is_access_to_paths_allowed - Check accesses for requests with a common path * @@ -1225,3 +1435,27 @@ __init void landlock_add_fs_hooks(void) security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } + +#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST + +/* clang-format off */ +static struct kunit_case test_cases[] = { + KUNIT_CASE(test_no_more_access), + KUNIT_CASE(test_scope_to_request_with_exec_none), + KUNIT_CASE(test_scope_to_request_with_exec_some), + KUNIT_CASE(test_scope_to_request_without_access), + KUNIT_CASE(test_is_eacces_with_none), + KUNIT_CASE(test_is_eacces_with_refer), + KUNIT_CASE(test_is_eacces_with_write), + {} +}; +/* clang-format on */ + +static struct kunit_suite test_suite = { + .name = "landlock_fs", + .test_cases = test_cases, +}; + +kunit_test_suite(test_suite); + +#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index 3bf506d4a63ccd..1b8f1abfedf041 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -37,6 +37,7 @@ CONFIG_REGMAP_BUILD=y CONFIG_SECURITY=y CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_SOUND=y CONFIG_SND=y From 66ba1133d970a4e4a86e7fbba72c4a71aa233707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:16 +0100 Subject: [PATCH 298/707] landlock: Add IOCTL access right MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the LANDLOCK_ACCESS_FS_IOCTL access right and increments the Landlock ABI version to 5. Like the truncate right, these rights are associated with a file descriptor at the time of open(2), and get respected even when the file descriptor is used outside of the thread which it was originally opened in. A newly enabled Landlock policy therefore does not apply to file descriptors which are already open. If the LANDLOCK_ACCESS_FS_IOCTL right is handled, only a small number of safe IOCTL commands will be permitted on newly opened files. The permitted IOCTLs can be configured through the ruleset in limited ways now. (See documentation for details.) Specifically, when LANDLOCK_ACCESS_FS_IOCTL is handled, granting this right on a file or directory will *not* permit to do all IOCTL commands, but only influence the IOCTL commands which are not already handled through other access rights. The intent is to keep the groups of IOCTL commands more fine-grained. Noteworthy scenarios which require special attention: TTY devices support IOCTLs like TIOCSTI and TIOCLINUX, which can be used to control shell processes on the same terminal which run at different privilege levels, which may make it possible to escape a sandbox. Because stdin, stdout and stderr are normally inherited rather than newly opened, IOCTLs are usually permitted on them even after the Landlock policy is enforced. Some legitimate file system features, like setting up fscrypt, are exposed as IOCTL commands on regular files and directories -- users of Landlock are advised to double check that the sandboxed process does not need to invoke these IOCTLs. Known limitations: The LANDLOCK_ACCESS_FS_IOCTL access right is a coarse-grained control over IOCTL commands. Future work will enable a more fine-grained access control for IOCTLs. In the meantime, Landlock users may use path-based restrictions in combination with their knowledge about the file system layout to control what IOCTLs can be done. Mounting file systems with the nodev option can help to distinguish regular files and devices, and give guarantees about the affected files, which Landlock alone can not give yet. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-5-gnoack@google.com Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 58 +++++- security/landlock/fs.c | 176 ++++++++++++++++++- security/landlock/fs.h | 2 + security/landlock/limits.h | 11 +- security/landlock/ruleset.h | 2 +- security/landlock/syscalls.c | 19 +- tools/testing/selftests/landlock/base_test.c | 2 +- tools/testing/selftests/landlock/fs_test.c | 5 +- 8 files changed, 253 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 25c8d76775393a..578f268b084b70 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -128,7 +128,7 @@ struct landlock_net_port_attr { * files and directories. Files or directories opened before the sandboxing * are not subject to these restrictions. * - * A file can only receive these access rights: + * The following access rights apply only to files: * * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file. * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that @@ -138,12 +138,13 @@ struct landlock_net_port_attr { * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access. * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`, * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with - * ``O_TRUNC``. Whether an opened file can be truncated with - * :manpage:`ftruncate(2)` is determined during :manpage:`open(2)`, in the - * same way as read and write permissions are checked during - * :manpage:`open(2)` using %LANDLOCK_ACCESS_FS_READ_FILE and - * %LANDLOCK_ACCESS_FS_WRITE_FILE. This access right is available since the - * third version of the Landlock ABI. + * ``O_TRUNC``. This access right is available since the third version of the + * Landlock ABI. + * + * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used + * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as + * read and write permissions are checked during :manpage:`open(2)` using + * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE. * * A directory can receive access rights related to files or directories. The * following access right is applied to the directory itself, and the @@ -198,13 +199,53 @@ struct landlock_net_port_attr { * If multiple requirements are not met, the ``EACCES`` error code takes * precedence over ``EXDEV``. * + * The following access right applies both to files and directories: + * + * - %LANDLOCK_ACCESS_FS_IOCTL: Invoke :manpage:`ioctl(2)` commands on an opened + * file or directory. + * + * This access right applies to all :manpage:`ioctl(2)` commands, except of + * ``FIOCLEX``, ``FIONCLEX``, ``FIONBIO`` and ``FIOASYNC``. These commands + * continue to be invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL + * access right. + * + * When certain other access rights are handled in the ruleset, in addition to + * %LANDLOCK_ACCESS_FS_IOCTL, granting these access rights will unlock access + * to additional groups of IOCTL commands, on the affected files: + * + * * %LANDLOCK_ACCESS_FS_READ_FILE unlocks access to ``FIOQSIZE``, + * ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``, ``FIONREAD``, + * ``FIDEDUPRANGE``. + * + * * %LANDLOCK_ACCESS_FS_WRITE_FILE unlocks access to ``FIOQSIZE``, + * ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``, ``FICLONE``, + * ``FICLONERANGE``, ``FS_IOC_RESVSP``, ``FS_IOC_RESVSP64``, + * ``FS_IOC_UNRESVSP``, ``FS_IOC_UNRESVSP64``, ``FS_IOC_ZERO_RANGE``. + * + * * %LANDLOCK_ACCESS_FS_READ_DIR unlocks access to ``FIOQSIZE``, + * ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``. + * + * When these access rights are handled in the ruleset, the availability of + * the affected IOCTL commands is not governed by %LANDLOCK_ACCESS_FS_IOCTL + * any more, but by the respective access right. + * + * All other IOCTL commands are not handled specially, and are governed by + * %LANDLOCK_ACCESS_FS_IOCTL. This includes %FS_IOC_GETFLAGS and + * %FS_IOC_SETFLAGS for manipulating inode flags (:manpage:`ioctl_iflags(2)`), + * %FS_IOC_FSFETXATTR and %FS_IOC_FSSETXATTR for manipulating extended + * attributes, as well as %FIFREEZE and %FITHAW for freezing and thawing file + * systems. + * + * This access right is available since the fifth version of the Landlock + * ABI. + * * .. warning:: * * It is currently not possible to restrict some file-related actions * accessible through these syscall families: :manpage:`chdir(2)`, * :manpage:`stat(2)`, :manpage:`flock(2)`, :manpage:`chmod(2)`, * :manpage:`chown(2)`, :manpage:`setxattr(2)`, :manpage:`utime(2)`, - * :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, :manpage:`access(2)`. + * :manpage:`fcntl(2)`, :manpage:`access(2)`. * Future Landlock evolutions will enable to restrict them. */ /* clang-format off */ @@ -223,6 +264,7 @@ struct landlock_net_port_attr { #define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) #define LANDLOCK_ACCESS_FS_REFER (1ULL << 13) #define LANDLOCK_ACCESS_FS_TRUNCATE (1ULL << 14) +#define LANDLOCK_ACCESS_FS_IOCTL (1ULL << 15) /* clang-format on */ /** diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 73997e63734f93..90f7f6db1e87dc 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -7,6 +7,7 @@ * Copyright © 2021-2022 Microsoft Corporation */ +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include #include "common.h" @@ -84,6 +87,145 @@ static const struct landlock_object_underops landlock_fs_underops = { .release = release_inode }; +/* IOCTL helpers */ + +/* + * These are synthetic access rights, which are only used within the kernel, but + * not exposed to callers in userspace. The mapping between these access rights + * and IOCTL commands is defined in the required_ioctl_access() helper function. + */ +#define LANDLOCK_ACCESS_FS_IOCTL_GROUP1 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1) +#define LANDLOCK_ACCESS_FS_IOCTL_GROUP2 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 2) +#define LANDLOCK_ACCESS_FS_IOCTL_GROUP3 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 3) +#define LANDLOCK_ACCESS_FS_IOCTL_GROUP4 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 4) + +/* ioctl_groups - all synthetic access rights for IOCTL command groups */ +/* clang-format off */ +#define IOCTL_GROUPS ( \ + LANDLOCK_ACCESS_FS_IOCTL_GROUP1 | \ + LANDLOCK_ACCESS_FS_IOCTL_GROUP2 | \ + LANDLOCK_ACCESS_FS_IOCTL_GROUP3 | \ + LANDLOCK_ACCESS_FS_IOCTL_GROUP4) +/* clang-format on */ + +static_assert((IOCTL_GROUPS & LANDLOCK_MASK_ACCESS_FS) == IOCTL_GROUPS); + +/** + * required_ioctl_access(): Determine required IOCTL access rights. + * + * @cmd: The IOCTL command that is supposed to be run. + * + * Returns: The access rights that must be granted on an opened file in order to + * use the given @cmd. + */ +static access_mask_t required_ioctl_access(unsigned int cmd) +{ + switch (cmd) { + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + /* + * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's + * close-on-exec and the file's buffered-IO and async flags. + * These operations are also available through fcntl(2), + * and are unconditionally permitted in Landlock. + */ + return 0; + case FIOQSIZE: + return LANDLOCK_ACCESS_FS_IOCTL_GROUP1; + case FS_IOC_FIEMAP: + case FIBMAP: + case FIGETBSZ: + return LANDLOCK_ACCESS_FS_IOCTL_GROUP2; + case FIONREAD: + case FIDEDUPERANGE: + return LANDLOCK_ACCESS_FS_IOCTL_GROUP3; + case FICLONE: + case FICLONERANGE: + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + case FS_IOC_UNRESVSP: + case FS_IOC_UNRESVSP64: + case FS_IOC_ZERO_RANGE: + return LANDLOCK_ACCESS_FS_IOCTL_GROUP4; + default: + /* + * Other commands are guarded by the catch-all access right. + */ + return LANDLOCK_ACCESS_FS_IOCTL; + } +} + +/** + * expand_ioctl() - Return the dst flags from either the src flag or the + * %LANDLOCK_ACCESS_FS_IOCTL flag, depending on whether the + * %LANDLOCK_ACCESS_FS_IOCTL and src access rights are handled or not. + * + * @handled: Handled access rights. + * @access: The access mask to copy values from. + * @src: A single access right to copy from in @access. + * @dst: One or more access rights to copy to. + * + * Returns: @dst, or 0. + */ +static access_mask_t expand_ioctl(const access_mask_t handled, + const access_mask_t access, + const access_mask_t src, + const access_mask_t dst) +{ + access_mask_t copy_from; + + if (!(handled & LANDLOCK_ACCESS_FS_IOCTL)) + return 0; + + copy_from = (handled & src) ? src : LANDLOCK_ACCESS_FS_IOCTL; + if (access & copy_from) + return dst; + + return 0; +} + +/** + * landlock_expand_access_fs() - Returns @access with the synthetic IOCTL group + * flags enabled if necessary. + * + * @handled: Handled FS access rights. + * @access: FS access rights to expand. + * + * Returns: @access expanded by the necessary flags for the synthetic IOCTL + * access rights. + */ +static access_mask_t landlock_expand_access_fs(const access_mask_t handled, + const access_mask_t access) +{ + return access | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_WRITE_FILE, + LANDLOCK_ACCESS_FS_IOCTL_GROUP1 | + LANDLOCK_ACCESS_FS_IOCTL_GROUP2 | + LANDLOCK_ACCESS_FS_IOCTL_GROUP4) | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_FILE, + LANDLOCK_ACCESS_FS_IOCTL_GROUP1 | + LANDLOCK_ACCESS_FS_IOCTL_GROUP2 | + LANDLOCK_ACCESS_FS_IOCTL_GROUP3) | + expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_DIR, + LANDLOCK_ACCESS_FS_IOCTL_GROUP1); +} + +/** + * landlock_expand_handled_access_fs() - add synthetic IOCTL access rights to an + * access mask of handled accesses. + * + * @handled: The handled accesses of a ruleset that is being created. + * + * Returns: @handled, with the bits for the synthetic IOCTL access rights set, + * if %LANDLOCK_ACCESS_FS_IOCTL is handled. + */ +access_mask_t landlock_expand_handled_access_fs(const access_mask_t handled) +{ + return landlock_expand_access_fs(handled, handled); +} + /* Ruleset management */ static struct landlock_object *get_inode_object(struct inode *const inode) @@ -148,7 +290,8 @@ static struct landlock_object *get_inode_object(struct inode *const inode) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ /* @@ -158,6 +301,7 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_rights) { + access_mask_t handled; int err; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, @@ -170,9 +314,11 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, if (WARN_ON_ONCE(ruleset->num_layers != 1)) return -EINVAL; + handled = landlock_get_fs_access_mask(ruleset, 0); + /* Expands the synthetic IOCTL groups. */ + access_rights |= landlock_expand_access_fs(handled, access_rights); /* Transforms relative access rights to absolute ones. */ - access_rights |= LANDLOCK_MASK_ACCESS_FS & - ~landlock_get_fs_access_mask(ruleset, 0); + access_rights |= LANDLOCK_MASK_ACCESS_FS & ~handled; id.key.object = get_inode_object(d_backing_inode(path->dentry)); if (IS_ERR(id.key.object)) return PTR_ERR(id.key.object); @@ -1333,7 +1479,9 @@ static int hook_file_open(struct file *const file) { layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; access_mask_t open_access_request, full_access_request, allowed_access; - const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; + const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE | + LANDLOCK_ACCESS_FS_IOCTL | + IOCTL_GROUPS; const struct landlock_ruleset *const dom = get_current_fs_domain(); if (!dom) @@ -1406,6 +1554,25 @@ static int hook_file_truncate(struct file *const file) return -EACCES; } +static int hook_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + const access_mask_t required_access = required_ioctl_access(cmd); + const access_mask_t allowed_access = + landlock_file(file)->allowed_access; + + /* + * It is the access rights at the time of opening the file which + * determine whether IOCTL can be used on the opened file later. + * + * The access right is attached to the opened file in hook_file_open(). + */ + if ((allowed_access & required_access) == required_access) + return 0; + + return -EACCES; +} + static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), @@ -1428,6 +1595,7 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security), LSM_HOOK_INIT(file_open, hook_file_open), LSM_HOOK_INIT(file_truncate, hook_file_truncate), + LSM_HOOK_INIT(file_ioctl, hook_file_ioctl), }; __init void landlock_add_fs_hooks(void) diff --git a/security/landlock/fs.h b/security/landlock/fs.h index 488e4813680ab7..c88fe7bda37b69 100644 --- a/security/landlock/fs.h +++ b/security/landlock/fs.h @@ -92,4 +92,6 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_hierarchy); +access_mask_t landlock_expand_handled_access_fs(const access_mask_t handled); + #endif /* _SECURITY_LANDLOCK_FS_H */ diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 93c9c6f915567e..296795f8a5c127 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -18,7 +18,16 @@ #define LANDLOCK_MAX_NUM_LAYERS 16 #define LANDLOCK_MAX_NUM_RULES U32_MAX -#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_TRUNCATE +/* + * For file system access rights, Landlock distinguishes between the publicly + * visible access rights (1 to LANDLOCK_LAST_PUBLIC_ACCESS_FS) and the private + * ones which are not exposed to userspace (LANDLOCK_LAST_PUBLIC_ACCESS_FS + 1 + * to LANDLOCK_LAST_ACCESS_FS). The private access rights are defined in fs.c. + */ +#define LANDLOCK_LAST_PUBLIC_ACCESS_FS LANDLOCK_ACCESS_FS_IOCTL +#define LANDLOCK_MASK_PUBLIC_ACCESS_FS ((LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1) - 1) + +#define LANDLOCK_LAST_ACCESS_FS (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 4) #define LANDLOCK_MASK_ACCESS_FS ((LANDLOCK_LAST_ACCESS_FS << 1) - 1) #define LANDLOCK_NUM_ACCESS_FS __const_hweight64(LANDLOCK_MASK_ACCESS_FS) #define LANDLOCK_SHIFT_ACCESS_FS 0 diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h index c7f1526784fd10..5a28ea8e1c3d50 100644 --- a/security/landlock/ruleset.h +++ b/security/landlock/ruleset.h @@ -30,7 +30,7 @@ LANDLOCK_ACCESS_FS_REFER) /* clang-format on */ -typedef u16 access_mask_t; +typedef u32 access_mask_t; /* Makes sure all filesystem access rights can be stored. */ static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_FS); /* Makes sure all network access rights can be stored. */ diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 898358f57fa085..f0bc50003b4684 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -137,7 +137,7 @@ static const struct file_operations ruleset_fops = { .write = fop_dummy_write, }; -#define LANDLOCK_ABI_VERSION 4 +#define LANDLOCK_ABI_VERSION 5 /** * sys_landlock_create_ruleset - Create a new ruleset @@ -192,8 +192,8 @@ SYSCALL_DEFINE3(landlock_create_ruleset, return err; /* Checks content (and 32-bits cast). */ - if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) != - LANDLOCK_MASK_ACCESS_FS) + if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_PUBLIC_ACCESS_FS) != + LANDLOCK_MASK_PUBLIC_ACCESS_FS) return -EINVAL; /* Checks network content (and 32-bits cast). */ @@ -201,6 +201,10 @@ SYSCALL_DEFINE3(landlock_create_ruleset, LANDLOCK_MASK_ACCESS_NET) return -EINVAL; + /* Expands synthetic IOCTL groups. */ + ruleset_attr.handled_access_fs = landlock_expand_handled_access_fs( + ruleset_attr.handled_access_fs); + /* Checks arguments and transforms to kernel struct. */ ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs, ruleset_attr.handled_access_net); @@ -309,8 +313,13 @@ static int add_rule_path_beneath(struct landlock_ruleset *const ruleset, if (!path_beneath_attr.allowed_access) return -ENOMSG; - /* Checks that allowed_access matches the @ruleset constraints. */ - mask = landlock_get_raw_fs_access_mask(ruleset, 0); + /* + * Checks that allowed_access matches the @ruleset constraints and only + * consists of publicly visible access rights (as opposed to synthetic + * ones). + */ + mask = landlock_get_raw_fs_access_mask(ruleset, 0) & + LANDLOCK_MASK_PUBLIC_ACCESS_FS; if ((path_beneath_attr.allowed_access | mask) != mask) return -EINVAL; diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 646f778dfb1eee..d292b419ccba40 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 2d6d9b43d958cf..3203f4a5bc8595 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -527,9 +527,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL #define ACCESS_ALL ( \ ACCESS_FILE | \ From e14aeb6cda036f78bd6a1eba2eb04b6d8beb2814 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 18 Oct 2022 16:23:53 -0700 Subject: [PATCH 299/707] ARM: brcmstb: Add debug UART entry for 74165 BCM74165 uses the same address map as the 7278 family (v7 memory map) therefore re-use that constant and shit down the other labels to keep numerical ordering. Signed-off-by: Florian Fainelli --- arch/arm/include/debug/brcmstb.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/debug/brcmstb.S b/arch/arm/include/debug/brcmstb.S index f6175e6e28cd22..3f7d68740ed4c9 100644 --- a/arch/arm/include/debug/brcmstb.S +++ b/arch/arm/include/debug/brcmstb.S @@ -27,6 +27,7 @@ #define UARTA_72165 UARTA_7278 #define UARTA_7364 REG_PHYS_ADDR(0x40b000) #define UARTA_7366 UARTA_7364 +#define UARTA_74165 UARTA_7278 #define UARTA_74371 REG_PHYS_ADDR(0x406b00) #define UARTA_7439 REG_PHYS_ADDR(0x40a900) #define UARTA_7445 REG_PHYS_ADDR(0x40ab00) @@ -88,9 +89,10 @@ ARM_BE8( rev \rv, \rv ) 30: checkuart(\rp, \rv, 0x72780000, 7278) 31: checkuart(\rp, \rv, 0x73640000, 7364) 32: checkuart(\rp, \rv, 0x73660000, 7366) -33: checkuart(\rp, \rv, 0x07437100, 74371) -34: checkuart(\rp, \rv, 0x74390000, 7439) -35: checkuart(\rp, \rv, 0x74450000, 7445) +33: checkuart(\rp, \rv, 0x07416500, 74165) +34: checkuart(\rp, \rv, 0x07437100, 74371) +35: checkuart(\rp, \rv, 0x74390000, 7439) +36: checkuart(\rp, \rv, 0x74450000, 7445) /* No valid UART found */ 90: mov \rp, #0 From cd3ac0d15df74727044a15e6c6954d9a11a59860 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 29 Jan 2024 19:27:40 +0800 Subject: [PATCH 300/707] f2fs: zone: fix to wait completion of last bio in zone correctly It needs to check last zone_pending_bio and wait IO completion before traverse next fio in io->io_list, otherwise, bio in next zone may be submitted before all IO completion in current zone. Fixes: e067dc3c6b9c ("f2fs: maintain six open zones for zoned devices") Cc: Daeho Jeong Signed-off-by: Chao Yu Reviewed-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 65fe48bb17d16b..ac82e69a9f5fda 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1010,7 +1010,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); - +next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { wait_for_completion_io(&io->zone_wait); @@ -1020,7 +1020,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) } #endif -next: if (fio->in_list) { spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { From 52434fdc4f86c261e986e7b24035b5ab20d32145 Mon Sep 17 00:00:00 2001 From: Wenjie Qi Date: Tue, 16 Jan 2024 22:11:38 +0800 Subject: [PATCH 301/707] f2fs: fix NULL pointer dereference in f2fs_submit_page_write() BUG: kernel NULL pointer dereference, address: 0000000000000014 RIP: 0010:f2fs_submit_page_write+0x6cf/0x780 [f2fs] Call Trace: ? show_regs+0x6e/0x80 ? __die+0x29/0x70 ? page_fault_oops+0x154/0x4a0 ? prb_read_valid+0x20/0x30 ? __irq_work_queue_local+0x39/0xd0 ? irq_work_queue+0x36/0x70 ? do_user_addr_fault+0x314/0x6c0 ? exc_page_fault+0x7d/0x190 ? asm_exc_page_fault+0x2b/0x30 ? f2fs_submit_page_write+0x6cf/0x780 [f2fs] ? f2fs_submit_page_write+0x736/0x780 [f2fs] do_write_page+0x50/0x170 [f2fs] f2fs_outplace_write_data+0x61/0xb0 [f2fs] f2fs_do_write_data_page+0x3f8/0x660 [f2fs] f2fs_write_single_data_page+0x5bb/0x7a0 [f2fs] f2fs_write_cache_pages+0x3da/0xbe0 [f2fs] ... It is possible that other threads have added this fio to io->bio and submitted the io->bio before entering f2fs_submit_page_write(). At this point io->bio = NULL. If is_end_zone_blkaddr(sbi, fio->new_blkaddr) of this fio is true, then an NULL pointer dereference error occurs at bio_get(io->bio). The original code for determining zone end was after "out:", which would have missed some fio who is zone end. I've moved this code before "skip:" to make sure it's done for each fio. Fixes: e067dc3c6b9c ("f2fs: maintain six open zones for zoned devices") Signed-off-by: Wenjie Qi Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac82e69a9f5fda..05158f89ef32db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1080,10 +1080,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) io->last_block_in_bio = fio->new_blkaddr; trace_f2fs_submit_page_write(fio->page, fio); -skip: - if (fio->in_list) - goto next; -out: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { @@ -1096,6 +1092,10 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) __submit_merged_bio(io); } #endif +skip: + if (fio->in_list) + goto next; +out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); From ed4c142b8c136481123f4ee140164e5023a536ac Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 10 Jan 2024 14:28:41 -0600 Subject: [PATCH 302/707] iio: adc: ad7380: new driver for AD7380 ADCs This adds a new driver for the AD7380 family ADCs. The driver currently implements basic support for the AD7380, AD7381, AD7383, and AD7384 2-channel differential ADCs. Support for additional single-ended and 4-channel chips that use the same register map as well as additional features of the chip will be added in future patches. Co-developed-by: Stefan Popa Signed-off-by: Stefan Popa Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20240110-ad7380-mainline-v4-2-93a1d96b50fa@baylibre.com Signed-off-by: Jonathan Cameron --- MAINTAINERS | 1 + drivers/iio/adc/Kconfig | 16 ++ drivers/iio/adc/Makefile | 1 + drivers/iio/adc/ad7380.c | 462 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 480 insertions(+) create mode 100644 drivers/iio/adc/ad7380.c diff --git a/MAINTAINERS b/MAINTAINERS index da83fdae811b79..c1a7f85fdbacfe 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -435,6 +435,7 @@ S: Supported W: https://wiki.analog.com/resources/tools-software/linux-drivers/iio-adc/ad738x W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml +F: drivers/iio/adc/ad7380.c AD7877 TOUCHSCREEN DRIVER M: Michael Hennerich diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 3b73c509bd68ef..59ae1d17b50d44 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -138,6 +138,22 @@ config AD7298 To compile this driver as a module, choose M here: the module will be called ad7298. +config AD7380 + tristate "Analog Devices AD7380 ADC driver" + depends on SPI_MASTER + select IIO_BUFFER + select IIO_TRIGGER + select IIO_TRIGGERED_BUFFER + help + AD7380 is a family of simultaneous sampling ADCs that share the same + SPI register map and have similar pinouts. + + Say yes here to build support for Analog Devices AD7380 ADC and + similar chips. + + To compile this driver as a module, choose M here: the module will be + called ad7380. + config AD7476 tristate "Analog Devices AD7476 1-channel ADCs driver and other similar devices from AD and TI" depends on SPI diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index d2fda54a3259c9..5a26ab6f110952 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_AD7291) += ad7291.o obj-$(CONFIG_AD7292) += ad7292.o obj-$(CONFIG_AD7298) += ad7298.o obj-$(CONFIG_AD7923) += ad7923.o +obj-$(CONFIG_AD7380) += ad7380.o obj-$(CONFIG_AD7476) += ad7476.o obj-$(CONFIG_AD7606_IFACE_PARALLEL) += ad7606_par.o obj-$(CONFIG_AD7606_IFACE_SPI) += ad7606_spi.o diff --git a/drivers/iio/adc/ad7380.c b/drivers/iio/adc/ad7380.c new file mode 100644 index 00000000000000..44b8b18ab213ac --- /dev/null +++ b/drivers/iio/adc/ad7380.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Analog Devices AD738x Simultaneous Sampling SAR ADCs + * + * Copyright 2017 Analog Devices Inc. + * Copyright 2023 BayLibre, SAS + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* 2.5V internal reference voltage */ +#define AD7380_INTERNAL_REF_MV 2500 + +/* reading and writing registers is more reliable at lower than max speed */ +#define AD7380_REG_WR_SPEED_HZ 10000000 + +#define AD7380_REG_WR BIT(15) +#define AD7380_REG_REGADDR GENMASK(14, 12) +#define AD7380_REG_DATA GENMASK(11, 0) + +#define AD7380_REG_ADDR_NOP 0x0 +#define AD7380_REG_ADDR_CONFIG1 0x1 +#define AD7380_REG_ADDR_CONFIG2 0x2 +#define AD7380_REG_ADDR_ALERT 0x3 +#define AD7380_REG_ADDR_ALERT_LOW_TH 0x4 +#define AD7380_REG_ADDR_ALERT_HIGH_TH 0x5 + +#define AD7380_CONFIG1_OS_MODE BIT(9) +#define AD7380_CONFIG1_OSR GENMASK(8, 6) +#define AD7380_CONFIG1_CRC_W BIT(5) +#define AD7380_CONFIG1_CRC_R BIT(4) +#define AD7380_CONFIG1_ALERTEN BIT(3) +#define AD7380_CONFIG1_RES BIT(2) +#define AD7380_CONFIG1_REFSEL BIT(1) +#define AD7380_CONFIG1_PMODE BIT(0) + +#define AD7380_CONFIG2_SDO2 GENMASK(9, 8) +#define AD7380_CONFIG2_SDO BIT(8) +#define AD7380_CONFIG2_RESET GENMASK(7, 0) + +#define AD7380_CONFIG2_RESET_SOFT 0x3C +#define AD7380_CONFIG2_RESET_HARD 0xFF + +#define AD7380_ALERT_LOW_TH GENMASK(11, 0) +#define AD7380_ALERT_HIGH_TH GENMASK(11, 0) + +struct ad7380_chip_info { + const char *name; + const struct iio_chan_spec *channels; + unsigned int num_channels; +}; + +#define AD7380_DIFFERENTIAL_CHANNEL(index, bits) { \ + .type = IIO_VOLTAGE, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), \ + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ + .indexed = 1, \ + .differential = 1, \ + .channel = 2 * (index), \ + .channel2 = 2 * (index) + 1, \ + .scan_index = (index), \ + .scan_type = { \ + .sign = 's', \ + .realbits = (bits), \ + .storagebits = 16, \ + .endianness = IIO_CPU, \ + }, \ +} + +#define DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(name, bits) \ +static const struct iio_chan_spec name[] = { \ + AD7380_DIFFERENTIAL_CHANNEL(0, bits), \ + AD7380_DIFFERENTIAL_CHANNEL(1, bits), \ + IIO_CHAN_SOFT_TIMESTAMP(2), \ +} + +/* fully differential */ +DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7380_channels, 16); +DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7381_channels, 14); +/* pseudo differential */ +DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7383_channels, 16); +DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7384_channels, 14); + +/* Since this is simultaneous sampling, we don't allow individual channels. */ +static const unsigned long ad7380_2_channel_scan_masks[] = { + GENMASK(1, 0), + 0 +}; + +static const struct ad7380_chip_info ad7380_chip_info = { + .name = "ad7380", + .channels = ad7380_channels, + .num_channels = ARRAY_SIZE(ad7380_channels), +}; + +static const struct ad7380_chip_info ad7381_chip_info = { + .name = "ad7381", + .channels = ad7381_channels, + .num_channels = ARRAY_SIZE(ad7381_channels), +}; + +static const struct ad7380_chip_info ad7383_chip_info = { + .name = "ad7383", + .channels = ad7383_channels, + .num_channels = ARRAY_SIZE(ad7383_channels), +}; + +static const struct ad7380_chip_info ad7384_chip_info = { + .name = "ad7384", + .channels = ad7384_channels, + .num_channels = ARRAY_SIZE(ad7384_channels), +}; + +struct ad7380_state { + const struct ad7380_chip_info *chip_info; + struct spi_device *spi; + struct regulator *vref; + struct regmap *regmap; + /* + * DMA (thus cache coherency maintenance) requires the + * transfer buffers to live in their own cache lines. + * Make the buffer large enough for 2 16-bit samples and one 64-bit + * aligned 64 bit timestamp. + */ + struct { + u16 raw[2]; + s64 ts __aligned(8); + } scan_data __aligned(IIO_DMA_MINALIGN); + u16 tx[2]; + u16 rx[2]; +}; + +static int ad7380_regmap_reg_write(void *context, unsigned int reg, + unsigned int val) +{ + struct ad7380_state *st = context; + struct spi_transfer xfer = { + .speed_hz = AD7380_REG_WR_SPEED_HZ, + .bits_per_word = 16, + .len = 2, + .tx_buf = &st->tx[0], + }; + + st->tx[0] = FIELD_PREP(AD7380_REG_WR, 1) | + FIELD_PREP(AD7380_REG_REGADDR, reg) | + FIELD_PREP(AD7380_REG_DATA, val); + + return spi_sync_transfer(st->spi, &xfer, 1); +} + +static int ad7380_regmap_reg_read(void *context, unsigned int reg, + unsigned int *val) +{ + struct ad7380_state *st = context; + struct spi_transfer xfers[] = { + { + .speed_hz = AD7380_REG_WR_SPEED_HZ, + .bits_per_word = 16, + .len = 2, + .tx_buf = &st->tx[0], + .cs_change = 1, + .cs_change_delay = { + .value = 10, /* t[CSH] */ + .unit = SPI_DELAY_UNIT_NSECS, + }, + }, { + .speed_hz = AD7380_REG_WR_SPEED_HZ, + .bits_per_word = 16, + .len = 2, + .rx_buf = &st->rx[0], + }, + }; + int ret; + + st->tx[0] = FIELD_PREP(AD7380_REG_WR, 0) | + FIELD_PREP(AD7380_REG_REGADDR, reg) | + FIELD_PREP(AD7380_REG_DATA, 0); + + ret = spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers)); + if (ret < 0) + return ret; + + *val = FIELD_GET(AD7380_REG_DATA, st->rx[0]); + + return 0; +} + +static const struct regmap_config ad7380_regmap_config = { + .reg_bits = 3, + .val_bits = 12, + .reg_read = ad7380_regmap_reg_read, + .reg_write = ad7380_regmap_reg_write, + .max_register = AD7380_REG_ADDR_ALERT_HIGH_TH, + .can_sleep = true, +}; + +static int ad7380_debugfs_reg_access(struct iio_dev *indio_dev, u32 reg, + u32 writeval, u32 *readval) +{ + struct ad7380_state *st = iio_priv(indio_dev); + int ret; + + ret = iio_device_claim_direct_mode(indio_dev); + if (ret) + return ret; + + if (readval) + ret = regmap_read(st->regmap, reg, readval); + else + ret = regmap_write(st->regmap, reg, writeval); + + iio_device_release_direct_mode(indio_dev); + + return ret; +} + +static irqreturn_t ad7380_trigger_handler(int irq, void *p) +{ + struct iio_poll_func *pf = p; + struct iio_dev *indio_dev = pf->indio_dev; + struct ad7380_state *st = iio_priv(indio_dev); + struct spi_transfer xfer = { + .bits_per_word = st->chip_info->channels[0].scan_type.realbits, + .len = 4, + .rx_buf = st->scan_data.raw, + }; + int ret; + + ret = spi_sync_transfer(st->spi, &xfer, 1); + if (ret) + goto out; + + iio_push_to_buffers_with_timestamp(indio_dev, &st->scan_data, + pf->timestamp); + +out: + iio_trigger_notify_done(indio_dev->trig); + + return IRQ_HANDLED; +} + +static int ad7380_read_direct(struct ad7380_state *st, + struct iio_chan_spec const *chan, int *val) +{ + struct spi_transfer xfers[] = { + /* toggle CS (no data xfer) to trigger a conversion */ + { + .speed_hz = AD7380_REG_WR_SPEED_HZ, + .bits_per_word = chan->scan_type.realbits, + .delay = { + .value = 190, /* t[CONVERT] */ + .unit = SPI_DELAY_UNIT_NSECS, + }, + .cs_change = 1, + .cs_change_delay = { + .value = 10, /* t[CSH] */ + .unit = SPI_DELAY_UNIT_NSECS, + }, + }, + /* then read both channels */ + { + .speed_hz = AD7380_REG_WR_SPEED_HZ, + .bits_per_word = chan->scan_type.realbits, + .rx_buf = &st->rx[0], + .len = 4, + }, + }; + int ret; + + ret = spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers)); + if (ret < 0) + return ret; + + *val = sign_extend32(st->rx[chan->scan_index], + chan->scan_type.realbits - 1); + + return IIO_VAL_INT; +} + +static int ad7380_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long info) +{ + struct ad7380_state *st = iio_priv(indio_dev); + int ret; + + switch (info) { + case IIO_CHAN_INFO_RAW: + ret = iio_device_claim_direct_mode(indio_dev); + if (ret) + return ret; + + ret = ad7380_read_direct(st, chan, val); + iio_device_release_direct_mode(indio_dev); + + return ret; + case IIO_CHAN_INFO_SCALE: + if (st->vref) { + ret = regulator_get_voltage(st->vref); + if (ret < 0) + return ret; + + *val = ret / 1000; + } else { + *val = AD7380_INTERNAL_REF_MV; + } + + *val2 = chan->scan_type.realbits; + + return IIO_VAL_FRACTIONAL_LOG2; + } + + return -EINVAL; +} + +static const struct iio_info ad7380_info = { + .read_raw = &ad7380_read_raw, + .debugfs_reg_access = &ad7380_debugfs_reg_access, +}; + +static int ad7380_init(struct ad7380_state *st) +{ + int ret; + + /* perform hard reset */ + ret = regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG2, + AD7380_CONFIG2_RESET, + FIELD_PREP(AD7380_CONFIG2_RESET, + AD7380_CONFIG2_RESET_HARD)); + if (ret < 0) + return ret; + + /* select internal or external reference voltage */ + ret = regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG1, + AD7380_CONFIG1_REFSEL, + FIELD_PREP(AD7380_CONFIG1_REFSEL, !!st->vref)); + if (ret < 0) + return ret; + + /* SPI 1-wire mode */ + return regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG2, + AD7380_CONFIG2_SDO, + FIELD_PREP(AD7380_CONFIG2_SDO, 1)); +} + +static void ad7380_regulator_disable(void *p) +{ + regulator_disable(p); +} + +static int ad7380_probe(struct spi_device *spi) +{ + struct iio_dev *indio_dev; + struct ad7380_state *st; + int ret; + + indio_dev = devm_iio_device_alloc(&spi->dev, sizeof(*st)); + if (!indio_dev) + return -ENOMEM; + + st = iio_priv(indio_dev); + st->spi = spi; + st->chip_info = spi_get_device_match_data(spi); + if (!st->chip_info) + return dev_err_probe(&spi->dev, -EINVAL, "missing match data\n"); + + st->vref = devm_regulator_get_optional(&spi->dev, "refio"); + if (IS_ERR(st->vref)) { + /* + * If there is no REFIO supply, then it means that we are using + * the internal 2.5V reference. + */ + if (PTR_ERR(st->vref) == -ENODEV) + st->vref = NULL; + else + return dev_err_probe(&spi->dev, PTR_ERR(st->vref), + "Failed to get refio regulator\n"); + } + + if (st->vref) { + ret = regulator_enable(st->vref); + if (ret) + return ret; + + ret = devm_add_action_or_reset(&spi->dev, ad7380_regulator_disable, + st->vref); + if (ret) + return ret; + } + + st->regmap = devm_regmap_init(&spi->dev, NULL, st, &ad7380_regmap_config); + if (IS_ERR(st->regmap)) + return dev_err_probe(&spi->dev, PTR_ERR(st->regmap), + "failed to allocate register map\n"); + + indio_dev->channels = st->chip_info->channels; + indio_dev->num_channels = st->chip_info->num_channels; + indio_dev->name = st->chip_info->name; + indio_dev->info = &ad7380_info; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->available_scan_masks = ad7380_2_channel_scan_masks; + + ret = devm_iio_triggered_buffer_setup(&spi->dev, indio_dev, + iio_pollfunc_store_time, + ad7380_trigger_handler, NULL); + if (ret) + return ret; + + ret = ad7380_init(st); + if (ret) + return ret; + + return devm_iio_device_register(&spi->dev, indio_dev); +} + +static const struct of_device_id ad7380_of_match_table[] = { + { .compatible = "adi,ad7380", .data = &ad7380_chip_info }, + { .compatible = "adi,ad7381", .data = &ad7381_chip_info }, + { .compatible = "adi,ad7383", .data = &ad7383_chip_info }, + { .compatible = "adi,ad7384", .data = &ad7384_chip_info }, + { } +}; + +static const struct spi_device_id ad7380_id_table[] = { + { "ad7380", (kernel_ulong_t)&ad7380_chip_info }, + { "ad7381", (kernel_ulong_t)&ad7381_chip_info }, + { "ad7383", (kernel_ulong_t)&ad7383_chip_info }, + { "ad7384", (kernel_ulong_t)&ad7384_chip_info }, + { } +}; +MODULE_DEVICE_TABLE(spi, ad7380_id_table); + +static struct spi_driver ad7380_driver = { + .driver = { + .name = "ad7380", + .of_match_table = ad7380_of_match_table, + }, + .probe = ad7380_probe, + .id_table = ad7380_id_table, +}; +module_spi_driver(ad7380_driver); + +MODULE_AUTHOR("Stefan Popa "); +MODULE_DESCRIPTION("Analog Devices AD738x ADC driver"); +MODULE_LICENSE("GPL"); From 4fef5d3feab7c7bdebec58f26d2856a8734fec50 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Jan 2024 21:50:05 -0800 Subject: [PATCH 303/707] iio: dummy_evgen: remove Excess kernel-doc comments Drop kernel-doc comments for struct fields that were removed to prevent kernel-doc warnings: iio_dummy_evgen.c:43: warning: Excess struct member 'irq_sim' description in 'iio_dummy_eventgen' iio_dummy_evgen.c:43: warning: Excess struct member 'base' description in 'iio_dummy_eventgen' Fixes: 337cbeb2c13e ("genirq/irq_sim: Simplify the API") Signed-off-by: Randy Dunlap Reviewed-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240121055005.20042-1-rdunlap@infradead.org Signed-off-by: Jonathan Cameron --- drivers/iio/dummy/iio_dummy_evgen.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iio/dummy/iio_dummy_evgen.c b/drivers/iio/dummy/iio_dummy_evgen.c index 5a0072727ba4b1..16d3f144dda040 100644 --- a/drivers/iio/dummy/iio_dummy_evgen.c +++ b/drivers/iio/dummy/iio_dummy_evgen.c @@ -31,8 +31,6 @@ * @regs: irq regs we are faking * @lock: protect the evgen state * @inuse: mask of which irqs are connected - * @irq_sim: interrupt simulator - * @base: base of irq range * @irq_sim_domain: irq simulator domain */ struct iio_dummy_eventgen { From 34c9ee68cd4c33921471fc956476255fd6ab8b86 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 17 Jan 2024 14:10:50 +0100 Subject: [PATCH 304/707] iio: imu: adis16475: make use of irq_get_trigger_type() There's no need to call both irq_get_irq_data() and irqd_get_trigger_type() as we already have an helper for that. This allows for code simplification. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240117-adis-improv-v1-2-7f90e9fad200@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis16475.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iio/imu/adis16475.c b/drivers/iio/imu/adis16475.c index 64be656f0b8052..01f55cc902faad 100644 --- a/drivers/iio/imu/adis16475.c +++ b/drivers/iio/imu/adis16475.c @@ -1363,22 +1363,16 @@ static int adis16475_config_sync_mode(struct adis16475 *st) static int adis16475_config_irq_pin(struct adis16475 *st) { int ret; - struct irq_data *desc; u32 irq_type; u16 val = 0; u8 polarity; struct spi_device *spi = st->adis.spi; - desc = irq_get_irq_data(spi->irq); - if (!desc) { - dev_err(&spi->dev, "Could not find IRQ %d\n", spi->irq); - return -EINVAL; - } /* * It is possible to configure the data ready polarity. Furthermore, we * need to update the adis struct if we want data ready as active low. */ - irq_type = irqd_get_trigger_type(desc); + irq_type = irq_get_trigger_type(spi->irq); if (irq_type == IRQ_TYPE_EDGE_RISING) { polarity = 1; st->adis.irq_flag = IRQF_TRIGGER_RISING; From 8ed2aa0a1b56695989b1fd5677d2d66d5e6d2406 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 17 Jan 2024 14:10:51 +0100 Subject: [PATCH 305/707] iio: imu: adis16480: make use of irq_get_trigger_type() There's no need to call both irq_get_irq_data() and irqd_get_trigger_type() as we already have an helper for that. This allows for code simplification. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240117-adis-improv-v1-3-7f90e9fad200@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis16480.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index fe520194a83717..b40a55bba30c19 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -1246,18 +1246,11 @@ static int adis16480_config_irq_pin(struct adis16480 *st) { struct device *dev = &st->adis.spi->dev; struct fwnode_handle *fwnode = dev_fwnode(dev); - struct irq_data *desc; enum adis16480_int_pin pin; unsigned int irq_type; uint16_t val; int i, irq = 0; - desc = irq_get_irq_data(st->adis.spi->irq); - if (!desc) { - dev_err(dev, "Could not find IRQ %d\n", irq); - return -EINVAL; - } - /* Disable data ready since the default after reset is on */ val = ADIS16480_DRDY_EN(0); @@ -1285,7 +1278,7 @@ static int adis16480_config_irq_pin(struct adis16480 *st) * configured as positive or negative, corresponding to * IRQ_TYPE_EDGE_RISING or IRQ_TYPE_EDGE_FALLING respectively. */ - irq_type = irqd_get_trigger_type(desc); + irq_type = irq_get_trigger_type(st->adis.spi->irq); if (irq_type == IRQ_TYPE_EDGE_RISING) { /* Default */ val |= ADIS16480_DRDY_POL(1); } else if (irq_type == IRQ_TYPE_EDGE_FALLING) { From 8ad16754891d0ea6654126515603c3c389d446dd Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 17 Jan 2024 13:41:04 +0100 Subject: [PATCH 306/707] iio: adc: ad_sigma_delta: allow overwriting the IRQ flags Make sure we can specify the IRQ trigger type from firmware and drivers won't ignore it. In fact, this how it should be done but since someone might be already depending on the driver to hardcode the trigger type (and not specifying it in firmware), let's do it like this so there's no possible breakage. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240117-dev_sigma_delta_no_irq_flags-v1-2-db39261592cf@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad_sigma_delta.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index 7e21928707437b..fbba3f4a118983 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -568,6 +568,7 @@ EXPORT_SYMBOL_NS_GPL(ad_sd_validate_trigger, IIO_AD_SIGMA_DELTA); static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_dev) { struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev); + unsigned long irq_flags = irq_get_trigger_type(sigma_delta->spi->irq); int ret; if (dev != &sigma_delta->spi->dev) { @@ -588,9 +589,13 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de /* the IRQ core clears IRQ_DISABLE_UNLAZY flag when freeing an IRQ */ irq_set_status_flags(sigma_delta->spi->irq, IRQ_DISABLE_UNLAZY); + /* Allow overwriting the flags from firmware */ + if (!irq_flags) + irq_flags = sigma_delta->info->irq_flags; + ret = devm_request_irq(dev, sigma_delta->spi->irq, ad_sd_data_rdy_trig_poll, - sigma_delta->info->irq_flags | IRQF_NO_AUTOEN, + irq_flags | IRQF_NO_AUTOEN, indio_dev->name, sigma_delta); if (ret) From 28065671fb19ccf932edbbc4e25f3e328079b0a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5rten=20Lindahl?= Date: Mon, 15 Jan 2024 12:44:36 +0100 Subject: [PATCH 307/707] iio: light: vcnl4000: Set ps high definition for 4040/4200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vcnl4040/vcnl4200 proximity sensor defaults to 12 bit data resolution, but the chip also supports 16 bit data resolution, which is called proximity high definition (PS_HD). Make the vcnl4040/vcnl4200 proximity sensor use the high definition for all data readings. Please note that in order to preserve the 12 bit integer part of the in_proximity_raw output, the format is changed from integer to fixed point. Signed-off-by: Mårten Lindahl Link: https://lore.kernel.org/r/20231221-vcnl4000-ps-hd-v3-1-6dcc889372be@axis.com Signed-off-by: Jonathan Cameron --- drivers/iio/light/vcnl4000.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c index fdf763a04b0bfa..4e3641ff2ed446 100644 --- a/drivers/iio/light/vcnl4000.c +++ b/drivers/iio/light/vcnl4000.c @@ -90,6 +90,7 @@ #define VCNL4040_PS_CONF1_PS_SHUTDOWN BIT(0) #define VCNL4040_PS_CONF2_PS_IT GENMASK(3, 1) /* Proximity integration time */ #define VCNL4040_CONF1_PS_PERS GENMASK(5, 4) /* Proximity interrupt persistence setting */ +#define VCNL4040_PS_CONF2_PS_HD BIT(11) /* Proximity high definition */ #define VCNL4040_PS_CONF2_PS_INT GENMASK(9, 8) /* Proximity interrupt mode */ #define VCNL4040_PS_CONF3_MPS GENMASK(6, 5) /* Proximity multi pulse number */ #define VCNL4040_PS_MS_LED_I GENMASK(10, 8) /* Proximity current */ @@ -114,6 +115,13 @@ #define VCNL4010_INT_DRDY \ (BIT(VCNL4010_INT_PROXIMITY) | BIT(VCNL4010_INT_ALS)) +#define VCNL4040_CONF3_PS_MPS_16BITS 3 /* 8 multi pulses */ +#define VCNL4040_CONF3_PS_LED_I_16BITS 3 /* 120 mA */ + +#define VCNL4040_CONF3_PS_SAMPLE_16BITS \ + (FIELD_PREP(VCNL4040_PS_CONF3_MPS, VCNL4040_CONF3_PS_MPS_16BITS) | \ + FIELD_PREP(VCNL4040_PS_MS_LED_I, VCNL4040_CONF3_PS_LED_I_16BITS)) + static const int vcnl4010_prox_sampling_frequency[][2] = { {1, 950000}, {3, 906250}, @@ -195,6 +203,7 @@ struct vcnl4000_data { enum vcnl4000_device_ids id; int rev; int al_scale; + int ps_scale; u8 ps_int; /* proximity interrupt mode */ u8 als_int; /* ambient light interrupt mode*/ const struct vcnl4000_chip_spec *chip_spec; @@ -345,6 +354,7 @@ static int vcnl4200_set_power_state(struct vcnl4000_data *data, bool on) static int vcnl4200_init(struct vcnl4000_data *data) { int ret, id; + u16 regval; ret = i2c_smbus_read_word_data(data->client, VCNL4200_DEV_ID); if (ret < 0) @@ -386,9 +396,32 @@ static int vcnl4200_init(struct vcnl4000_data *data) break; } data->al_scale = data->chip_spec->ulux_step; + data->ps_scale = 16; mutex_init(&data->vcnl4200_al.lock); mutex_init(&data->vcnl4200_ps.lock); + /* Use 16 bits proximity sensor readings */ + ret = i2c_smbus_read_word_data(data->client, VCNL4200_PS_CONF1); + if (ret < 0) + return ret; + + regval = ret | VCNL4040_PS_CONF2_PS_HD; + ret = i2c_smbus_write_word_data(data->client, VCNL4200_PS_CONF1, + regval); + if (ret < 0) + return ret; + + /* Align proximity sensor sample rate to 16 bits data width */ + ret = i2c_smbus_read_word_data(data->client, VCNL4200_PS_CONF3); + if (ret < 0) + return ret; + + regval = ret | VCNL4040_CONF3_PS_SAMPLE_16BITS; + ret = i2c_smbus_write_word_data(data->client, VCNL4200_PS_CONF3, + regval); + if (ret < 0) + return ret; + ret = data->chip_spec->set_power_state(data, true); if (ret < 0) return ret; @@ -901,8 +934,9 @@ static int vcnl4000_read_raw(struct iio_dev *indio_dev, break; case IIO_PROXIMITY: ret = data->chip_spec->measure_proximity(data, val); + *val2 = data->ps_scale; if (!ret) - ret = IIO_VAL_INT; + ret = IIO_VAL_FRACTIONAL; break; default: ret = -EINVAL; From 0ff5430cb47cc98bb0a8da65a1c9ea79b69145c6 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 15 Jan 2024 16:26:07 +0200 Subject: [PATCH 308/707] iio: test: test gain-time-scale helpers Some light sensors can adjust both the HW-gain and integration time. There are cases where adjusting the integration time has similar impact to the scale of the reported values as gain setting has. IIO users do typically expect to handle scale by a single writable 'scale' entry. Driver should then adjust the gain/time accordingly. It however is difficult for a driver to know whether it should change gain or integration time to meet the requested scale. Usually it is preferred to have longer integration time which usually improves accuracy, but there may be use-cases where long measurement times can be an issue. Thus it can be preferable to allow also changing the integration time - but mitigate the scale impact by also changing the gain underneath. Eg, if integration time change doubles the measured values, the driver can reduce the HW-gain to half. The theory of the computations of gain-time-scale is simple. However, some people (undersigned) got that implemented wrong for more than once. Hence some gain-time-scale helpers were introduced. Add some simple tests to verify the most hairy functions. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/0f7505b43f91394dc3bb636369489c897b7e01a7.1705328293.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- drivers/iio/test/Kconfig | 14 + drivers/iio/test/Makefile | 1 + drivers/iio/test/iio-test-gts.c | 513 ++++++++++++++++++++++++++++++++ 3 files changed, 528 insertions(+) create mode 100644 drivers/iio/test/iio-test-gts.c diff --git a/drivers/iio/test/Kconfig b/drivers/iio/test/Kconfig index 0b6e4e278a2f61..33cca49c8058ae 100644 --- a/drivers/iio/test/Kconfig +++ b/drivers/iio/test/Kconfig @@ -4,6 +4,20 @@ # # Keep in alphabetical order +config IIO_GTS_KUNIT_TEST + tristate "Test IIO formatting functions" if !KUNIT_ALL_TESTS + depends on KUNIT + select IIO_GTS_HELPER + select TEST_KUNIT_DEVICE_HELPERS + default KUNIT_ALL_TESTS + help + build unit tests for the IIO light sensor gain-time-scale helpers. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. Keep in alphabetical order + config IIO_RESCALE_KUNIT_TEST tristate "Test IIO rescale conversion functions" if !KUNIT_ALL_TESTS depends on KUNIT && IIO_RESCALE diff --git a/drivers/iio/test/Makefile b/drivers/iio/test/Makefile index d76eaf36da8206..e9a4cf1ff57f01 100644 --- a/drivers/iio/test/Makefile +++ b/drivers/iio/test/Makefile @@ -6,4 +6,5 @@ # Keep in alphabetical order obj-$(CONFIG_IIO_RESCALE_KUNIT_TEST) += iio-test-rescale.o obj-$(CONFIG_IIO_FORMAT_KUNIT_TEST) += iio-test-format.o +obj-$(CONFIG_IIO_GTS_KUNIT_TEST) += iio-test-gts.o CFLAGS_iio-test-format.o += $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/iio/test/iio-test-gts.c b/drivers/iio/test/iio-test-gts.c new file mode 100644 index 00000000000000..cf7ab773ea0b15 --- /dev/null +++ b/drivers/iio/test/iio-test-gts.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unit tests for IIO light sensor gain-time-scale helpers + * + * Copyright (c) 2023 Matti Vaittinen + */ + +#include +#include +#include +#include +#include + +/* + * Please, read the "rant" from the top of the lib/test_linear_ranges.c if + * you see a line of helper code which is not being tested. + * + * Then, please look at the line which is not being tested. Is this line + * somehow unusually complex? If answer is "no", then chances are that the + * "development inertia" caused by adding a test exceeds the benefits. + * + * If yes, then adding a test is probably a good idea but please stop for a + * moment and consider the effort of changing all the tests when code gets + * refactored. Eventually it neeeds to be. + */ + +#define TEST_TSEL_50 1 +#define TEST_TSEL_X_MIN TEST_TSEL_50 +#define TEST_TSEL_100 0 +#define TEST_TSEL_200 2 +#define TEST_TSEL_400 4 +#define TEST_TSEL_X_MAX TEST_TSEL_400 + +#define TEST_GSEL_1 0x00 +#define TEST_GSEL_X_MIN TEST_GSEL_1 +#define TEST_GSEL_4 0x08 +#define TEST_GSEL_16 0x0a +#define TEST_GSEL_32 0x0b +#define TEST_GSEL_64 0x0c +#define TEST_GSEL_256 0x18 +#define TEST_GSEL_512 0x19 +#define TEST_GSEL_1024 0x1a +#define TEST_GSEL_2048 0x1b +#define TEST_GSEL_4096 0x1c +#define TEST_GSEL_X_MAX TEST_GSEL_4096 + +#define TEST_SCALE_1X 64 +#define TEST_SCALE_MIN_X TEST_SCALE_1X +#define TEST_SCALE_2X 32 +#define TEST_SCALE_4X 16 +#define TEST_SCALE_8X 8 +#define TEST_SCALE_16X 4 +#define TEST_SCALE_32X 2 +#define TEST_SCALE_64X 1 + +#define TEST_SCALE_NANO_128X 500000000 +#define TEST_SCALE_NANO_256X 250000000 +#define TEST_SCALE_NANO_512X 125000000 +#define TEST_SCALE_NANO_1024X 62500000 +#define TEST_SCALE_NANO_2048X 31250000 +#define TEST_SCALE_NANO_4096X 15625000 +#define TEST_SCALE_NANO_4096X2 7812500 +#define TEST_SCALE_NANO_4096X4 3906250 +#define TEST_SCALE_NANO_4096X8 1953125 + +#define TEST_SCALE_NANO_MAX_X TEST_SCALE_NANO_4096X8 + +/* + * Can't have this allocated from stack because the kunit clean-up will + * happen only after the test function has already gone + */ +static struct iio_gts gts; + +static const struct iio_gain_sel_pair gts_test_gains[] = { + GAIN_SCALE_GAIN(1, TEST_GSEL_1), + GAIN_SCALE_GAIN(4, TEST_GSEL_4), + GAIN_SCALE_GAIN(16, TEST_GSEL_16), + GAIN_SCALE_GAIN(32, TEST_GSEL_32), + GAIN_SCALE_GAIN(64, TEST_GSEL_64), + GAIN_SCALE_GAIN(256, TEST_GSEL_256), + GAIN_SCALE_GAIN(512, TEST_GSEL_512), + GAIN_SCALE_GAIN(1024, TEST_GSEL_1024), + GAIN_SCALE_GAIN(2048, TEST_GSEL_2048), + GAIN_SCALE_GAIN(4096, TEST_GSEL_4096), +#define HWGAIN_MAX 4096 +}; + +static const struct iio_itime_sel_mul gts_test_itimes[] = { + GAIN_SCALE_ITIME_US(400 * 1000, TEST_TSEL_400, 8), + GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4), + GAIN_SCALE_ITIME_US(100 * 1000, TEST_TSEL_100, 2), + GAIN_SCALE_ITIME_US(50 * 1000, TEST_TSEL_50, 1), +#define TIMEGAIN_MAX 8 +}; +#define TOTAL_GAIN_MAX (HWGAIN_MAX * TIMEGAIN_MAX) +#define IIO_GTS_TEST_DEV "iio-gts-test-dev" + +static struct device *__test_init_iio_gain_scale(struct kunit *test, + struct iio_gts *gts, const struct iio_gain_sel_pair *g_table, + int num_g, const struct iio_itime_sel_mul *i_table, int num_i) +{ + struct device *dev; + int ret; + + dev = kunit_device_register(test, IIO_GTS_TEST_DEV); + + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + if (IS_ERR_OR_NULL(dev)) + return NULL; + + ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, g_table, num_g, + i_table, num_i, gts); + KUNIT_EXPECT_EQ(test, 0, ret); + if (ret) + return NULL; + + return dev; +} + +#define test_init_iio_gain_scale(test, gts) \ + __test_init_iio_gain_scale(test, gts, gts_test_gains, \ + ARRAY_SIZE(gts_test_gains), gts_test_itimes, \ + ARRAY_SIZE(gts_test_itimes)) + +static void test_init_iio_gts_invalid(struct kunit *test) +{ + struct device *dev; + int ret; + const struct iio_itime_sel_mul itimes_neg[] = { + GAIN_SCALE_ITIME_US(-10, TEST_TSEL_400, 8), + GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4), + }; + const struct iio_gain_sel_pair gains_neg[] = { + GAIN_SCALE_GAIN(1, TEST_GSEL_1), + GAIN_SCALE_GAIN(2, TEST_GSEL_4), + GAIN_SCALE_GAIN(-2, TEST_GSEL_16), + }; + /* 55555 * 38656 = 2147534080 => overflows 32bit int */ + const struct iio_itime_sel_mul itimes_overflow[] = { + GAIN_SCALE_ITIME_US(400 * 1000, TEST_TSEL_400, 55555), + GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4), + }; + const struct iio_gain_sel_pair gains_overflow[] = { + GAIN_SCALE_GAIN(1, TEST_GSEL_1), + GAIN_SCALE_GAIN(2, TEST_GSEL_4), + GAIN_SCALE_GAIN(38656, TEST_GSEL_16), + }; + + dev = kunit_device_register(test, IIO_GTS_TEST_DEV); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + if (!dev) + return; + + /* Ok gains, negative time */ + ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gts_test_gains, + ARRAY_SIZE(gts_test_gains), itimes_neg, + ARRAY_SIZE(itimes_neg), >s); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + + /* Ok times, negative gain */ + ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gains_neg, + ARRAY_SIZE(gains_neg), gts_test_itimes, + ARRAY_SIZE(gts_test_itimes), >s); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + + /* gain * time overflow int */ + ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gains_overflow, + ARRAY_SIZE(gains_overflow), itimes_overflow, + ARRAY_SIZE(itimes_overflow), >s); + KUNIT_EXPECT_EQ(test, -EOVERFLOW, ret); +} + +static void test_iio_gts_find_gain_for_scale_using_time(struct kunit *test) +{ + struct device *dev; + int ret, gain_sel; + + dev = test_init_iio_gain_scale(test, >s); + if (!dev) + return; + + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_100, + TEST_SCALE_8X, 0, &gain_sel); + /* + * Meas time 100 => gain by time 2x + * TEST_SCALE_8X matches total gain 8x + * => required HWGAIN 4x + */ + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_GSEL_4, gain_sel); + + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_200, 0, + TEST_SCALE_NANO_256X, &gain_sel); + /* + * Meas time 200 => gain by time 4x + * TEST_SCALE_256X matches total gain 256x + * => required HWGAIN 256/4 => 64x + */ + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_GSEL_64, gain_sel); + + /* Min time, Min gain */ + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_X_MIN, + TEST_SCALE_MIN_X, 0, &gain_sel); + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_GSEL_1, gain_sel); + + /* Max time, Max gain */ + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_X_MAX, + 0, TEST_SCALE_NANO_MAX_X, &gain_sel); + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_GSEL_4096, gain_sel); + + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_100, 0, + TEST_SCALE_NANO_256X, &gain_sel); + /* + * Meas time 100 => gain by time 2x + * TEST_SCALE_256X matches total gain 256x + * => required HWGAIN 256/2 => 128x (not in gain-table - unsupported) + */ + KUNIT_EXPECT_NE(test, 0, ret); + + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_200, 0, + TEST_SCALE_NANO_MAX_X, &gain_sel); + /* We can't reach the max gain with integration time smaller than MAX */ + KUNIT_EXPECT_NE(test, 0, ret); + + ret = iio_gts_find_gain_sel_for_scale_using_time(>s, TEST_TSEL_50, 0, + TEST_SCALE_NANO_MAX_X, &gain_sel); + /* We can't reach the max gain with integration time smaller than MAX */ + KUNIT_EXPECT_NE(test, 0, ret); +} + +static void test_iio_gts_find_new_gain_sel_by_old_gain_time(struct kunit *test) +{ + struct device *dev; + int ret, old_gain, new_gain, old_time_sel, new_time_sel; + + dev = test_init_iio_gain_scale(test, >s); + if (!dev) + return; + + old_gain = 32; + old_time_sel = TEST_TSEL_200; + new_time_sel = TEST_TSEL_400; + + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_EQ(test, 0, ret); + /* + * Doubling the integration time doubles the total gain - so old + * (hw)gain must be divided by two to compensate. => 32 / 2 => 16 + */ + KUNIT_EXPECT_EQ(test, 16, new_gain); + + old_gain = 4; + old_time_sel = TEST_TSEL_50; + new_time_sel = TEST_TSEL_200; + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_EQ(test, 0, ret); + /* + * gain by time 1x => 4x - (hw)gain 4x => 1x + */ + KUNIT_EXPECT_EQ(test, 1, new_gain); + + old_gain = 512; + old_time_sel = TEST_TSEL_400; + new_time_sel = TEST_TSEL_50; + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_EQ(test, 0, ret); + /* + * gain by time 8x => 1x - (hw)gain 512x => 4096x) + */ + KUNIT_EXPECT_EQ(test, 4096, new_gain); + + /* Unsupported gain 2x */ + old_gain = 4; + old_time_sel = TEST_TSEL_200; + new_time_sel = TEST_TSEL_400; + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_NE(test, 0, ret); + + /* Too small gain */ + old_gain = 4; + old_time_sel = TEST_TSEL_50; + new_time_sel = TEST_TSEL_400; + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_NE(test, 0, ret); + + /* Too big gain */ + old_gain = 1024; + old_time_sel = TEST_TSEL_400; + new_time_sel = TEST_TSEL_50; + ret = iio_gts_find_new_gain_sel_by_old_gain_time(>s, old_gain, + old_time_sel, new_time_sel, &new_gain); + KUNIT_EXPECT_NE(test, 0, ret); + +} + +static void test_iio_find_closest_gain_low(struct kunit *test) +{ + struct device *dev; + bool in_range; + int ret; + + const struct iio_gain_sel_pair gts_test_gains_gain_low[] = { + GAIN_SCALE_GAIN(4, TEST_GSEL_4), + GAIN_SCALE_GAIN(16, TEST_GSEL_16), + GAIN_SCALE_GAIN(32, TEST_GSEL_32), + }; + + dev = test_init_iio_gain_scale(test, >s); + if (!dev) + return; + + ret = iio_find_closest_gain_low(>s, 2, &in_range); + KUNIT_EXPECT_EQ(test, 1, ret); + KUNIT_EXPECT_EQ(test, true, in_range); + + ret = iio_find_closest_gain_low(>s, 1, &in_range); + KUNIT_EXPECT_EQ(test, 1, ret); + KUNIT_EXPECT_EQ(test, true, in_range); + + ret = iio_find_closest_gain_low(>s, 4095, &in_range); + KUNIT_EXPECT_EQ(test, 2048, ret); + KUNIT_EXPECT_EQ(test, true, in_range); + + ret = iio_find_closest_gain_low(>s, 4097, &in_range); + KUNIT_EXPECT_EQ(test, 4096, ret); + KUNIT_EXPECT_EQ(test, false, in_range); + + kunit_device_unregister(test, dev); + + dev = __test_init_iio_gain_scale(test, >s, gts_test_gains_gain_low, + ARRAY_SIZE(gts_test_gains_gain_low), + gts_test_itimes, ARRAY_SIZE(gts_test_itimes)); + if (!dev) + return; + + ret = iio_find_closest_gain_low(>s, 3, &in_range); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + KUNIT_EXPECT_EQ(test, false, in_range); +} + +static void test_iio_gts_total_gain_to_scale(struct kunit *test) +{ + struct device *dev; + int ret, scale_int, scale_nano; + + dev = test_init_iio_gain_scale(test, >s); + if (!dev) + return; + + ret = iio_gts_total_gain_to_scale(>s, 1, &scale_int, &scale_nano); + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_SCALE_1X, scale_int); + KUNIT_EXPECT_EQ(test, 0, scale_nano); + + ret = iio_gts_total_gain_to_scale(>s, 1, &scale_int, &scale_nano); + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, TEST_SCALE_1X, scale_int); + KUNIT_EXPECT_EQ(test, 0, scale_nano); + + ret = iio_gts_total_gain_to_scale(>s, 4096 * 8, &scale_int, + &scale_nano); + KUNIT_EXPECT_EQ(test, 0, ret); + KUNIT_EXPECT_EQ(test, 0, scale_int); + KUNIT_EXPECT_EQ(test, TEST_SCALE_NANO_4096X8, scale_nano); +} + +static void test_iio_gts_chk_times(struct kunit *test, const int *vals) +{ + static const int expected[] = {0, 50000, 0, 100000, 0, 200000, 0, 400000}; + int i; + + for (i = 0; i < ARRAY_SIZE(expected); i++) + KUNIT_EXPECT_EQ(test, expected[i], vals[i]); +} + +static void test_iio_gts_chk_scales_all(struct kunit *test, struct iio_gts *gts, + const int *vals, int len) +{ + static const int gains[] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, 4096, 4096 * 2, 4096 * 4, + 4096 * 8}; + int expected[ARRAY_SIZE(gains) * 2]; + int i, ret; + int exp_len = ARRAY_SIZE(gains) * 2; + + KUNIT_EXPECT_EQ(test, exp_len, len); + if (len != exp_len) + return; + + for (i = 0; i < ARRAY_SIZE(gains); i++) { + ret = iio_gts_total_gain_to_scale(gts, gains[i], + &expected[2 * i], + &expected[2 * i + 1]); + KUNIT_EXPECT_EQ(test, 0, ret); + if (ret) + return; + } + + for (i = 0; i < ARRAY_SIZE(expected); i++) + KUNIT_EXPECT_EQ(test, expected[i], vals[i]); +} + +static void test_iio_gts_chk_scales_t200(struct kunit *test, struct iio_gts *gts, + const int *vals, int len) +{ + /* The gain caused by time 200 is 4x */ + static const int gains[] = { + 1 * 4, + 4 * 4, + 16 * 4, + 32 * 4, + 64 * 4, + 256 * 4, + 512 * 4, + 1024 * 4, + 2048 * 4, + 4096 * 4 + }; + int expected[ARRAY_SIZE(gains) * 2]; + int i, ret; + + KUNIT_EXPECT_EQ(test, 2 * ARRAY_SIZE(gains), len); + if (len < 2 * ARRAY_SIZE(gains)) + return; + + for (i = 0; i < ARRAY_SIZE(gains); i++) { + ret = iio_gts_total_gain_to_scale(gts, gains[i], + &expected[2 * i], + &expected[2 * i + 1]); + KUNIT_EXPECT_EQ(test, 0, ret); + if (ret) + return; + } + + for (i = 0; i < ARRAY_SIZE(expected); i++) + KUNIT_EXPECT_EQ(test, expected[i], vals[i]); +} + +static void test_iio_gts_avail_test(struct kunit *test) +{ + struct device *dev; + int ret; + int type, len; + const int *vals; + + dev = test_init_iio_gain_scale(test, >s); + if (!dev) + return; + + /* test table building for times and iio_gts_avail_times() */ + ret = iio_gts_avail_times(>s, &vals, &type, &len); + KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret); + if (ret) + return; + + KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_MICRO, type); + KUNIT_EXPECT_EQ(test, 8, len); + if (len < 8) + return; + + test_iio_gts_chk_times(test, vals); + + /* Test table building for all scales and iio_gts_all_avail_scales() */ + ret = iio_gts_all_avail_scales(>s, &vals, &type, &len); + KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret); + if (ret) + return; + + KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_NANO, type); + + test_iio_gts_chk_scales_all(test, >s, vals, len); + + /* + * Test table building for scales/time and + * iio_gts_avail_scales_for_time() + */ + ret = iio_gts_avail_scales_for_time(>s, 200000, &vals, &type, &len); + KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret); + if (ret) + return; + + KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_NANO, type); + test_iio_gts_chk_scales_t200(test, >s, vals, len); +} + +static struct kunit_case iio_gts_test_cases[] = { + KUNIT_CASE(test_init_iio_gts_invalid), + KUNIT_CASE(test_iio_gts_find_gain_for_scale_using_time), + KUNIT_CASE(test_iio_gts_find_new_gain_sel_by_old_gain_time), + KUNIT_CASE(test_iio_find_closest_gain_low), + KUNIT_CASE(test_iio_gts_total_gain_to_scale), + KUNIT_CASE(test_iio_gts_avail_test), + {} +}; + +static struct kunit_suite iio_gts_test_suite = { + .name = "iio-gain-time-scale", + .test_cases = iio_gts_test_cases, +}; + +kunit_test_suite(iio_gts_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Matti Vaittinen "); +MODULE_DESCRIPTION("Test IIO light sensor gain-time-scale helpers"); +MODULE_IMPORT_NS(IIO_GTS_HELPER); From 724a827065cff0a65d57aa4b6a181504e13eff47 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 15 Jan 2024 16:26:33 +0200 Subject: [PATCH 309/707] MAINTAINERS: add IIO GTS tests Add undersigned as a maintainer for IIO GTS helper's KUnit tests. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/52c66fe2798192529738ac2ab98a27230a6ad8cd.1705328293.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c1a7f85fdbacfe..0d015d85e30c9f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10415,6 +10415,7 @@ L: linux-iio@vger.kernel.org S: Maintained F: drivers/iio/industrialio-gts-helper.c F: include/linux/iio/iio-gts-helper.h +F: drivers/iio/test/iio-test-gts.c IIO MULTIPLEXER M: Peter Rosin From 1d972256f02416ec05d8a126c36cc2fa0699af2a Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Tue, 23 Jan 2024 08:09:15 -0600 Subject: [PATCH 310/707] iio: health: afe4403: Use devm action helper for regulator disable Use a device lifecycle managed action for regulator disable function. This helps prevent mistakes like unregistering out of order in cleanup functions and forgetting to unregister on error paths. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240123140918.215818-1-afd@ti.com Signed-off-by: Jonathan Cameron --- drivers/iio/health/afe4403.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/iio/health/afe4403.c b/drivers/iio/health/afe4403.c index df3bc5c3d3786d..9e9d6de2a7c83e 100644 --- a/drivers/iio/health/afe4403.c +++ b/drivers/iio/health/afe4403.c @@ -346,6 +346,13 @@ static irqreturn_t afe4403_trigger_handler(int irq, void *private) return IRQ_HANDLED; } +static void afe4403_regulator_disable(void *data) +{ + struct regulator *regulator = data; + + regulator_disable(regulator); +} + #define AFE4403_TIMING_PAIRS \ { AFE440X_LED2STC, 0x000050 }, \ { AFE440X_LED2ENDC, 0x0003e7 }, \ @@ -495,19 +502,24 @@ static int afe4403_probe(struct spi_device *spi) dev_err(afe->dev, "Unable to enable regulator\n"); return ret; } + ret = devm_add_action_or_reset(afe->dev, afe4403_regulator_disable, afe->regulator); + if (ret) { + dev_err(afe->dev, "Unable to add regulator disable action\n"); + return ret; + } ret = regmap_write(afe->regmap, AFE440X_CONTROL0, AFE440X_CONTROL0_SW_RESET); if (ret) { dev_err(afe->dev, "Unable to reset device\n"); - goto err_disable_reg; + return ret; } ret = regmap_multi_reg_write(afe->regmap, afe4403_reg_sequences, ARRAY_SIZE(afe4403_reg_sequences)); if (ret) { dev_err(afe->dev, "Unable to set register defaults\n"); - goto err_disable_reg; + return ret; } indio_dev->modes = INDIO_DIRECT_MODE; @@ -523,8 +535,7 @@ static int afe4403_probe(struct spi_device *spi) iio_device_id(indio_dev)); if (!afe->trig) { dev_err(afe->dev, "Unable to allocate IIO trigger\n"); - ret = -ENOMEM; - goto err_disable_reg; + return -ENOMEM; } iio_trigger_set_drvdata(afe->trig, indio_dev); @@ -532,7 +543,7 @@ static int afe4403_probe(struct spi_device *spi) ret = iio_trigger_register(afe->trig); if (ret) { dev_err(afe->dev, "Unable to register IIO trigger\n"); - goto err_disable_reg; + return ret; } ret = devm_request_threaded_irq(afe->dev, afe->irq, @@ -566,8 +577,6 @@ static int afe4403_probe(struct spi_device *spi) err_trig: if (afe->irq > 0) iio_trigger_unregister(afe->trig); -err_disable_reg: - regulator_disable(afe->regulator); return ret; } @@ -584,10 +593,6 @@ static void afe4403_remove(struct spi_device *spi) if (afe->irq > 0) iio_trigger_unregister(afe->trig); - - ret = regulator_disable(afe->regulator); - if (ret) - dev_warn(afe->dev, "Unable to disable regulator\n"); } static const struct spi_device_id afe4403_ids[] = { From 88fcff083c72af43a33e2c6de06852576afd20a8 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Tue, 23 Jan 2024 08:09:16 -0600 Subject: [PATCH 311/707] iio: health: afe4403: Use devm IIO helpers Use a device lifecycle managed IIO helper functions. This helps prevent mistakes like unregistering and freeing out of order in cleanup functions and forgetting to unregister and free on error paths. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240123140918.215818-2-afd@ti.com Signed-off-by: Jonathan Cameron --- drivers/iio/health/afe4403.c | 38 ++++++++---------------------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/drivers/iio/health/afe4403.c b/drivers/iio/health/afe4403.c index 9e9d6de2a7c83e..1dbe48dae74eed 100644 --- a/drivers/iio/health/afe4403.c +++ b/drivers/iio/health/afe4403.c @@ -540,7 +540,7 @@ static int afe4403_probe(struct spi_device *spi) iio_trigger_set_drvdata(afe->trig, indio_dev); - ret = iio_trigger_register(afe->trig); + ret = devm_iio_trigger_register(afe->dev, afe->trig); if (ret) { dev_err(afe->dev, "Unable to register IIO trigger\n"); return ret; @@ -553,46 +553,25 @@ static int afe4403_probe(struct spi_device *spi) afe->trig); if (ret) { dev_err(afe->dev, "Unable to request IRQ\n"); - goto err_trig; + return ret; } } - ret = iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, - afe4403_trigger_handler, NULL); + ret = devm_iio_triggered_buffer_setup(afe->dev, indio_dev, + &iio_pollfunc_store_time, + afe4403_trigger_handler, NULL); if (ret) { dev_err(afe->dev, "Unable to setup buffer\n"); - goto err_trig; + return ret; } - ret = iio_device_register(indio_dev); + ret = devm_iio_device_register(afe->dev, indio_dev); if (ret) { dev_err(afe->dev, "Unable to register IIO device\n"); - goto err_buff; + return ret; } return 0; - -err_buff: - iio_triggered_buffer_cleanup(indio_dev); -err_trig: - if (afe->irq > 0) - iio_trigger_unregister(afe->trig); - - return ret; -} - -static void afe4403_remove(struct spi_device *spi) -{ - struct iio_dev *indio_dev = spi_get_drvdata(spi); - struct afe4403_data *afe = iio_priv(indio_dev); - int ret; - - iio_device_unregister(indio_dev); - - iio_triggered_buffer_cleanup(indio_dev); - - if (afe->irq > 0) - iio_trigger_unregister(afe->trig); } static const struct spi_device_id afe4403_ids[] = { @@ -608,7 +587,6 @@ static struct spi_driver afe4403_spi_driver = { .pm = pm_sleep_ptr(&afe4403_pm_ops), }, .probe = afe4403_probe, - .remove = afe4403_remove, .id_table = afe4403_ids, }; module_spi_driver(afe4403_spi_driver); From d07f4b7f0413bafd3a62c677ba064aad90c7aca4 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Tue, 23 Jan 2024 08:09:17 -0600 Subject: [PATCH 312/707] iio: health: afe4404: Use devm action helper for regulator disable Use a device lifecycle managed action for regulator disable function. This helps prevent mistakes like unregistering out of order in cleanup functions and forgetting to unregister on error paths. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240123140918.215818-3-afd@ti.com Signed-off-by: Jonathan Cameron --- drivers/iio/health/afe4404.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/iio/health/afe4404.c b/drivers/iio/health/afe4404.c index ede1e82013118c..75a513a92242c1 100644 --- a/drivers/iio/health/afe4404.c +++ b/drivers/iio/health/afe4404.c @@ -349,6 +349,13 @@ static irqreturn_t afe4404_trigger_handler(int irq, void *private) return IRQ_HANDLED; } +static void afe4404_regulator_disable(void *data) +{ + struct regulator *regulator = data; + + regulator_disable(regulator); +} + /* Default timings from data-sheet */ #define AFE4404_TIMING_PAIRS \ { AFE440X_PRPCOUNT, 39999 }, \ @@ -502,19 +509,24 @@ static int afe4404_probe(struct i2c_client *client) dev_err(afe->dev, "Unable to enable regulator\n"); return ret; } + ret = devm_add_action_or_reset(afe->dev, afe4404_regulator_disable, afe->regulator); + if (ret) { + dev_err(afe->dev, "Unable to enable regulator\n"); + return ret; + } ret = regmap_write(afe->regmap, AFE440X_CONTROL0, AFE440X_CONTROL0_SW_RESET); if (ret) { dev_err(afe->dev, "Unable to reset device\n"); - goto disable_reg; + return ret; } ret = regmap_multi_reg_write(afe->regmap, afe4404_reg_sequences, ARRAY_SIZE(afe4404_reg_sequences)); if (ret) { dev_err(afe->dev, "Unable to set register defaults\n"); - goto disable_reg; + return ret; } indio_dev->modes = INDIO_DIRECT_MODE; @@ -530,8 +542,7 @@ static int afe4404_probe(struct i2c_client *client) iio_device_id(indio_dev)); if (!afe->trig) { dev_err(afe->dev, "Unable to allocate IIO trigger\n"); - ret = -ENOMEM; - goto disable_reg; + return -ENOMEM; } iio_trigger_set_drvdata(afe->trig, indio_dev); @@ -539,7 +550,7 @@ static int afe4404_probe(struct i2c_client *client) ret = iio_trigger_register(afe->trig); if (ret) { dev_err(afe->dev, "Unable to register IIO trigger\n"); - goto disable_reg; + return ret; } ret = devm_request_threaded_irq(afe->dev, afe->irq, @@ -549,7 +560,7 @@ static int afe4404_probe(struct i2c_client *client) afe->trig); if (ret) { dev_err(afe->dev, "Unable to request IRQ\n"); - goto disable_reg; + return ret; } } @@ -573,8 +584,6 @@ static int afe4404_probe(struct i2c_client *client) unregister_trigger: if (afe->irq > 0) iio_trigger_unregister(afe->trig); -disable_reg: - regulator_disable(afe->regulator); return ret; } @@ -591,10 +600,6 @@ static void afe4404_remove(struct i2c_client *client) if (afe->irq > 0) iio_trigger_unregister(afe->trig); - - ret = regulator_disable(afe->regulator); - if (ret) - dev_err(afe->dev, "Unable to disable regulator\n"); } static const struct i2c_device_id afe4404_ids[] = { From 07af2d40bf7a8632c1a6c74cf967a9290d830679 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Tue, 23 Jan 2024 08:09:18 -0600 Subject: [PATCH 313/707] iio: health: afe4404: Use devm IIO helpers Use a device lifecycle managed IIO helper functions. This helps prevent mistakes like unregistering and freeing out of order in cleanup functions and forgetting to unregister and free on error paths. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240123140918.215818-4-afd@ti.com Signed-off-by: Jonathan Cameron --- drivers/iio/health/afe4404.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/drivers/iio/health/afe4404.c b/drivers/iio/health/afe4404.c index 75a513a92242c1..7768b07ef7a6f8 100644 --- a/drivers/iio/health/afe4404.c +++ b/drivers/iio/health/afe4404.c @@ -547,7 +547,7 @@ static int afe4404_probe(struct i2c_client *client) iio_trigger_set_drvdata(afe->trig, indio_dev); - ret = iio_trigger_register(afe->trig); + ret = devm_iio_trigger_register(afe->dev, afe->trig); if (ret) { dev_err(afe->dev, "Unable to register IIO trigger\n"); return ret; @@ -564,42 +564,21 @@ static int afe4404_probe(struct i2c_client *client) } } - ret = iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time, - afe4404_trigger_handler, NULL); + ret = devm_iio_triggered_buffer_setup(afe->dev, indio_dev, + &iio_pollfunc_store_time, + afe4404_trigger_handler, NULL); if (ret) { dev_err(afe->dev, "Unable to setup buffer\n"); - goto unregister_trigger; + return ret; } - ret = iio_device_register(indio_dev); + ret = devm_iio_device_register(afe->dev, indio_dev); if (ret) { dev_err(afe->dev, "Unable to register IIO device\n"); - goto unregister_triggered_buffer; + return ret; } return 0; - -unregister_triggered_buffer: - iio_triggered_buffer_cleanup(indio_dev); -unregister_trigger: - if (afe->irq > 0) - iio_trigger_unregister(afe->trig); - - return ret; -} - -static void afe4404_remove(struct i2c_client *client) -{ - struct iio_dev *indio_dev = i2c_get_clientdata(client); - struct afe4404_data *afe = iio_priv(indio_dev); - int ret; - - iio_device_unregister(indio_dev); - - iio_triggered_buffer_cleanup(indio_dev); - - if (afe->irq > 0) - iio_trigger_unregister(afe->trig); } static const struct i2c_device_id afe4404_ids[] = { @@ -615,7 +594,6 @@ static struct i2c_driver afe4404_i2c_driver = { .pm = pm_sleep_ptr(&afe4404_pm_ops), }, .probe = afe4404_probe, - .remove = afe4404_remove, .id_table = afe4404_ids, }; module_i2c_driver(afe4404_i2c_driver); From f460d1951562aae64af600cab35619633e65b8c8 Mon Sep 17 00:00:00 2001 From: Kim Seer Paller Date: Tue, 23 Jan 2024 16:10:58 +0800 Subject: [PATCH 314/707] dt-bindings: iio: frequency: add admfm2000 Dual microwave down converter module with input RF and LO frequency ranges from 0.5 to 32 GHz and an output IF frequency range from 0.1 to 8 GHz. It consists of a LNA, mixer, IF filter, DSA, and IF amplifier for each down conversion path. Signed-off-by: Kim Seer Paller Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240123081059.5746-1-kimseer.paller@analog.com Signed-off-by: Jonathan Cameron --- .../bindings/iio/frequency/adi,admfm2000.yaml | 127 ++++++++++++++++++ MAINTAINERS | 7 + 2 files changed, 134 insertions(+) create mode 100644 Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml diff --git a/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml b/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml new file mode 100644 index 00000000000000..2bcf4bbc12e41f --- /dev/null +++ b/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright 2024 Analog Devices Inc. +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/frequency/adi,admfm2000.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ADMFM2000 Dual Microwave Down Converter + +maintainers: + - Kim Seer Paller + +description: + Dual microwave down converter module with input RF and LO frequency ranges + from 0.5 to 32 GHz and an output IF frequency range from 0.1 to 8 GHz. + It consists of a LNA, mixer, IF filter, DSA, and IF amplifier for each down + conversion path. + +properties: + compatible: + enum: + - adi,admfm2000 + + '#address-cells': + const: 1 + + '#size-cells': + const: 0 + +patternProperties: + "^channel@[0-1]$": + type: object + description: Represents a channel of the device. + + additionalProperties: false + + properties: + reg: + description: + The channel number. + minimum: 0 + maximum: 1 + + adi,mixer-mode: + description: + Enable mixer mode for the channel. It downconverts RF between 5 GHz + and 32 GHz to IF between 0.5 GHz and 8 GHz. If not present, the channel + is in direct IF mode which bypasses the mixer and downconverts RF + between 2 GHz and 8 GHz to IF between 0.5 GHz and 8 GHz. + type: boolean + + switch-gpios: + description: | + GPIOs to select the RF path for the channel. The same state of CTRL-A + and CTRL-B GPIOs is not permitted. + CTRL-A CTRL-B CH1 Status CH2 Status + 1 0 Direct IF mode Mixer mode + 0 1 Mixer mode Direct IF mode + + items: + - description: CTRL-A GPIO + - description: CTRL-B GPIO + + attenuation-gpios: + description: | + Choice of attenuation: + DSA-V4 DSA-V3 DSA-V2 DSA-V1 DSA-V0 + 1 1 1 1 1 0 dB + 1 1 1 1 0 -1 dB + 1 1 1 0 1 -2 dB + 1 1 0 1 1 -4 dB + 1 0 1 1 1 -8 dB + 0 1 1 1 1 -16 dB + 0 0 0 0 0 -31 dB + + items: + - description: DSA-V0 GPIO + - description: DSA-V1 GPIO + - description: DSA-V2 GPIO + - description: DSA-V3 GPIO + - description: DSA-V4 GPIO + + required: + - reg + - switch-gpios + - attenuation-gpios + +required: + - compatible + +additionalProperties: false + +examples: + - | + #include + converter { + compatible = "adi,admfm2000"; + + #address-cells = <1>; + #size-cells = <0>; + + channel@0 { + reg = <0>; + switch-gpios = <&gpio 1 GPIO_ACTIVE_LOW>, + <&gpio 2 GPIO_ACTIVE_HIGH>; + + attenuation-gpios = <&gpio 17 GPIO_ACTIVE_LOW>, + <&gpio 22 GPIO_ACTIVE_LOW>, + <&gpio 23 GPIO_ACTIVE_LOW>, + <&gpio 24 GPIO_ACTIVE_LOW>, + <&gpio 25 GPIO_ACTIVE_LOW>; + }; + + channel@1 { + reg = <1>; + adi,mixer-mode; + switch-gpios = <&gpio 3 GPIO_ACTIVE_LOW>, + <&gpio 4 GPIO_ACTIVE_HIGH>; + + attenuation-gpios = <&gpio 0 GPIO_ACTIVE_LOW>, + <&gpio 5 GPIO_ACTIVE_LOW>, + <&gpio 6 GPIO_ACTIVE_LOW>, + <&gpio 16 GPIO_ACTIVE_LOW>, + <&gpio 26 GPIO_ACTIVE_LOW>; + }; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index 0d015d85e30c9f..10c8048771b48f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1277,6 +1277,13 @@ W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml F: drivers/hwmon/adm1177.c +ANALOG DEVICES INC ADMFM2000 DRIVER +M: Kim Seer Paller +L: linux-iio@vger.kernel.org +S: Supported +W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml + ANALOG DEVICES INC ADMV1013 DRIVER M: Antoniu Miclaus L: linux-iio@vger.kernel.org From a0295c1bd4a79461f291c9b0df0523cbbeb75560 Mon Sep 17 00:00:00 2001 From: Kim Seer Paller Date: Tue, 23 Jan 2024 16:10:59 +0800 Subject: [PATCH 315/707] iio: frequency: admfm2000: New driver Dual microwave down converter module with input RF and LO frequency ranges from 0.5 to 32 GHz and an output IF frequency range from 0.1 to 8 GHz. It consists of a LNA, mixer, IF filter, DSA, and IF amplifier for each down conversion path. Signed-off-by: Kim Seer Paller Link: https://lore.kernel.org/r/20240123081059.5746-2-kimseer.paller@analog.com Signed-off-by: Jonathan Cameron --- MAINTAINERS | 1 + drivers/iio/frequency/Kconfig | 10 ++ drivers/iio/frequency/Makefile | 1 + drivers/iio/frequency/admfm2000.c | 282 ++++++++++++++++++++++++++++++ 4 files changed, 294 insertions(+) create mode 100644 drivers/iio/frequency/admfm2000.c diff --git a/MAINTAINERS b/MAINTAINERS index 10c8048771b48f..00d354af10f5dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1283,6 +1283,7 @@ L: linux-iio@vger.kernel.org S: Supported W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml +F: drivers/iio/frequency/admfm2000.c ANALOG DEVICES INC ADMV1013 DRIVER M: Antoniu Miclaus diff --git a/drivers/iio/frequency/Kconfig b/drivers/iio/frequency/Kconfig index 9e85dfa585081c..c455be7d4a1c88 100644 --- a/drivers/iio/frequency/Kconfig +++ b/drivers/iio/frequency/Kconfig @@ -60,6 +60,16 @@ config ADF4377 To compile this driver as a module, choose M here: the module will be called adf4377. +config ADMFM2000 + tristate "Analog Devices ADMFM2000 Dual Microwave Down Converter" + depends on GPIOLIB + help + Say yes here to build support for Analog Devices ADMFM2000 Dual + Microwave Down Converter. + + To compile this driver as a module, choose M here: the + module will be called admfm2000. + config ADMV1013 tristate "Analog Devices ADMV1013 Microwave Upconverter" depends on SPI && COMMON_CLK diff --git a/drivers/iio/frequency/Makefile b/drivers/iio/frequency/Makefile index b616c29b4a0873..70d0e0b70e8021 100644 --- a/drivers/iio/frequency/Makefile +++ b/drivers/iio/frequency/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_AD9523) += ad9523.o obj-$(CONFIG_ADF4350) += adf4350.o obj-$(CONFIG_ADF4371) += adf4371.o obj-$(CONFIG_ADF4377) += adf4377.o +obj-$(CONFIG_ADMFM2000) += admfm2000.o obj-$(CONFIG_ADMV1013) += admv1013.o obj-$(CONFIG_ADMV1014) += admv1014.o obj-$(CONFIG_ADMV4420) += admv4420.o diff --git a/drivers/iio/frequency/admfm2000.c b/drivers/iio/frequency/admfm2000.c new file mode 100644 index 00000000000000..c34d79e55a7c58 --- /dev/null +++ b/drivers/iio/frequency/admfm2000.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ADMFM2000 Dual Microwave Down Converter + * + * Copyright 2024 Analog Devices Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ADMFM2000_MIXER_MODE 0 +#define ADMFM2000_DIRECT_IF_MODE 1 +#define ADMFM2000_DSA_GPIOS 5 +#define ADMFM2000_MODE_GPIOS 2 +#define ADMFM2000_MAX_GAIN 0 +#define ADMFM2000_MIN_GAIN -31000 +#define ADMFM2000_DEFAULT_GAIN -0x20 + +struct admfm2000_state { + struct mutex lock; /* protect sensor state */ + struct gpio_desc *sw1_ch[2]; + struct gpio_desc *sw2_ch[2]; + struct gpio_desc *dsa1_gpios[5]; + struct gpio_desc *dsa2_gpios[5]; + u32 gain[2]; +}; + +static int admfm2000_mode(struct iio_dev *indio_dev, u32 chan, u32 mode) +{ + struct admfm2000_state *st = iio_priv(indio_dev); + int i; + + switch (mode) { + case ADMFM2000_MIXER_MODE: + for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) { + gpiod_set_value_cansleep(st->sw1_ch[i], (chan == 0) ? 1 : 0); + gpiod_set_value_cansleep(st->sw2_ch[i], (chan == 0) ? 0 : 1); + } + return 0; + case ADMFM2000_DIRECT_IF_MODE: + for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) { + gpiod_set_value_cansleep(st->sw1_ch[i], (chan == 0) ? 0 : 1); + gpiod_set_value_cansleep(st->sw2_ch[i], (chan == 0) ? 1 : 0); + } + return 0; + default: + return -EINVAL; + } +} + +static int admfm2000_attenuation(struct iio_dev *indio_dev, u32 chan, u32 value) +{ + struct admfm2000_state *st = iio_priv(indio_dev); + int i; + + switch (chan) { + case 0: + for (i = 0; i < ADMFM2000_DSA_GPIOS; i++) + gpiod_set_value_cansleep(st->dsa1_gpios[i], value & (1 << i)); + return 0; + case 1: + for (i = 0; i < ADMFM2000_DSA_GPIOS; i++) + gpiod_set_value_cansleep(st->dsa2_gpios[i], value & (1 << i)); + return 0; + default: + return -EINVAL; + } +} + +static int admfm2000_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, int *val, + int *val2, long mask) +{ + struct admfm2000_state *st = iio_priv(indio_dev); + int gain; + + switch (mask) { + case IIO_CHAN_INFO_HARDWAREGAIN: + mutex_lock(&st->lock); + gain = ~(st->gain[chan->channel]) * -1000; + *val = gain / 1000; + *val2 = (gain % 1000) * 1000; + mutex_unlock(&st->lock); + + return IIO_VAL_INT_PLUS_MICRO_DB; + default: + return -EINVAL; + } +} + +static int admfm2000_write_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, int val, + int val2, long mask) +{ + struct admfm2000_state *st = iio_priv(indio_dev); + int gain, ret; + + if (val < 0) + gain = (val * 1000) - (val2 / 1000); + else + gain = (val * 1000) + (val2 / 1000); + + if (gain > ADMFM2000_MAX_GAIN || gain < ADMFM2000_MIN_GAIN) + return -EINVAL; + + switch (mask) { + case IIO_CHAN_INFO_HARDWAREGAIN: + mutex_lock(&st->lock); + st->gain[chan->channel] = ~((abs(gain) / 1000) & 0x1F); + + ret = admfm2000_attenuation(indio_dev, chan->channel, + st->gain[chan->channel]); + mutex_unlock(&st->lock); + return ret; + default: + return -EINVAL; + } +} + +static int admfm2000_write_raw_get_fmt(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + long mask) +{ + switch (mask) { + case IIO_CHAN_INFO_HARDWAREGAIN: + return IIO_VAL_INT_PLUS_MICRO_DB; + default: + return -EINVAL; + } +} + +static const struct iio_info admfm2000_info = { + .read_raw = &admfm2000_read_raw, + .write_raw = &admfm2000_write_raw, + .write_raw_get_fmt = &admfm2000_write_raw_get_fmt, +}; + +#define ADMFM2000_CHAN(_channel) { \ + .type = IIO_VOLTAGE, \ + .output = 1, \ + .indexed = 1, \ + .channel = _channel, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_HARDWAREGAIN), \ +} + +static const struct iio_chan_spec admfm2000_channels[] = { + ADMFM2000_CHAN(0), + ADMFM2000_CHAN(1), +}; + +static int admfm2000_channel_config(struct admfm2000_state *st, + struct iio_dev *indio_dev) +{ + struct platform_device *pdev = to_platform_device(indio_dev->dev.parent); + struct device *dev = &pdev->dev; + struct fwnode_handle *child; + struct gpio_desc **dsa; + struct gpio_desc **sw; + int ret, i; + bool mode; + u32 reg; + + device_for_each_child_node(dev, child) { + ret = fwnode_property_read_u32(child, "reg", ®); + if (ret) { + fwnode_handle_put(child); + return dev_err_probe(dev, ret, + "Failed to get reg property\n"); + } + + if (reg >= indio_dev->num_channels) { + fwnode_handle_put(child); + return dev_err_probe(dev, -EINVAL, "reg bigger than: %d\n", + indio_dev->num_channels); + } + + if (fwnode_property_present(child, "adi,mixer-mode")) + mode = ADMFM2000_MIXER_MODE; + else + mode = ADMFM2000_DIRECT_IF_MODE; + + switch (reg) { + case 0: + sw = st->sw1_ch; + dsa = st->dsa1_gpios; + break; + case 1: + sw = st->sw2_ch; + dsa = st->dsa2_gpios; + break; + default: + fwnode_handle_put(child); + return -EINVAL; + } + + for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) { + sw[i] = devm_fwnode_gpiod_get_index(dev, child, "switch", + i, GPIOD_OUT_LOW, NULL); + if (IS_ERR(sw[i])) { + fwnode_handle_put(child); + return dev_err_probe(dev, PTR_ERR(sw[i]), + "Failed to get gpios\n"); + } + } + + for (i = 0; i < ADMFM2000_DSA_GPIOS; i++) { + dsa[i] = devm_fwnode_gpiod_get_index(dev, child, + "attenuation", i, + GPIOD_OUT_LOW, NULL); + if (IS_ERR(dsa[i])) { + fwnode_handle_put(child); + return dev_err_probe(dev, PTR_ERR(dsa[i]), + "Failed to get gpios\n"); + } + } + + ret = admfm2000_mode(indio_dev, reg, mode); + if (ret) { + fwnode_handle_put(child); + return ret; + } + } + + return 0; +} + +static int admfm2000_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct admfm2000_state *st; + struct iio_dev *indio_dev; + int ret; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*st)); + if (!indio_dev) + return -ENOMEM; + + st = iio_priv(indio_dev); + + indio_dev->name = "admfm2000"; + indio_dev->num_channels = ARRAY_SIZE(admfm2000_channels); + indio_dev->channels = admfm2000_channels; + indio_dev->info = &admfm2000_info; + indio_dev->modes = INDIO_DIRECT_MODE; + + st->gain[0] = ADMFM2000_DEFAULT_GAIN; + st->gain[1] = ADMFM2000_DEFAULT_GAIN; + + mutex_init(&st->lock); + + ret = admfm2000_channel_config(st, indio_dev); + if (ret) + return ret; + + return devm_iio_device_register(dev, indio_dev); +} + +static const struct of_device_id admfm2000_of_match[] = { + { .compatible = "adi,admfm2000" }, + { } +}; +MODULE_DEVICE_TABLE(of, admfm2000_of_match); + +static struct platform_driver admfm2000_driver = { + .driver = { + .name = "admfm2000", + .of_match_table = admfm2000_of_match, + }, + .probe = admfm2000_probe, +}; +module_platform_driver(admfm2000_driver); + +MODULE_AUTHOR("Kim Seer Paller "); +MODULE_DESCRIPTION("ADMFM2000 Dual Microwave Down Converter"); +MODULE_LICENSE("GPL"); From 83253fc066b969c9c86acb44c8dd090f4f39e820 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Jan 2024 23:19:16 +0800 Subject: [PATCH 316/707] f2fs: support printk_ratelimited() in f2fs_printk() This patch supports using printk_ratelimited() in f2fs_printk(), and wrap ratelimited f2fs_printk() into f2fs_{err,warn,info}_ratelimited(), then, use these new helps to clean up codes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 10 +++++----- fs/f2fs/dir.c | 5 ++--- fs/f2fs/f2fs.h | 40 +++++++++++++++++++++++----------------- fs/f2fs/super.c | 11 ++++++++--- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index ff26b49c0d71ff..0fd839358c1576 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc) ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { - printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "lzo-rle compress failed, ret:%d", ret); return -EIO; } return 0; @@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (provided != calculated) { if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); - printk_ratelimited( - "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", - KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + f2fs_info_ratelimited(sbi, + "checksum invalid, nid = %lu, %x vs %x", + dic->inode->i_ino, provided, calculated); } set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 042593aed1ec0a..3f20d94e12f90a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -995,9 +995,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { if (found_valid_dirent || !bit_pos) { - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, + f2fs_warn_ratelimited(sbi, + "invalid namelen(0), ino:%u, run fsck to fix.", le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4481f68d64181c..b4b737e43a6b3a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1812,6 +1812,27 @@ struct f2fs_sb_info { #endif }; +__printf(3, 4) +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...); + +#define f2fs_err(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_notice(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__) +#define f2fs_info(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__) +#define f2fs_debug(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__) + +#define f2fs_err_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_info_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__) + #ifdef CONFIG_F2FS_FAULT_INJECTION #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ __builtin_return_address(0)) @@ -1829,9 +1850,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", - KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type], - func, parent_func); + f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", + f2fs_fault_name[type], func, parent_func); return true; } return false; @@ -2325,20 +2345,6 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return -ENOSPC; } -__printf(2, 3) -void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...); - -#define f2fs_err(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__) -#define f2fs_warn(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__) -#define f2fs_notice(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__) -#define f2fs_info(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__) -#define f2fs_debug(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__) - #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ static inline bool page_private_##name(struct page *page) \ { \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e2c066fbc0fa11..3e2a5e3b3e9919 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -264,7 +264,8 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; -void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -275,8 +276,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (limit_rate) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); va_end(args); } From 9a63d6e6382a93b8c6fa7f53d1ec64d9b87e7293 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Jan 2024 23:19:17 +0800 Subject: [PATCH 317/707] f2fs: use f2fs_err_ratelimited() to avoid redundant logs Use f2fs_err_ratelimited() to instead f2fs_err() in f2fs_record_stop_reason() and f2fs_record_errors() to avoid redundant logs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e2a5e3b3e9919..1b718bebfaa106 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4096,7 +4096,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->sb_lock); if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err); + f2fs_err_ratelimited(sbi, + "f2fs_commit_super fails to record stop_reason, err:%d", + err); } void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) @@ -4139,8 +4141,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) err = f2fs_commit_super(sbi, false); if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", - error, err); + f2fs_err_ratelimited(sbi, + "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); out_unlock: f2fs_up_write(&sbi->sb_lock); } From 9f100ecdedc3f9a5d8a7aeb5b53bc12659825f9f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Jan 2024 10:23:13 +0800 Subject: [PATCH 318/707] f2fs: compress: fix to cover f2fs_disable_compressed_file() w/ i_sem - f2fs_disable_compressed_file - check inode_has_data - f2fs_file_mmap - mkwrite - f2fs_get_block_locked : update metadata in compressed inode's disk layout - fi->i_flags &= ~F2FS_COMPR_FL - clear_inode_flag(inode, FI_COMPRESSED_FILE); we should use i_sem lock to prevent above race case. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b4b737e43a6b3a..40d428636532ab 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4415,15 +4415,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - if (!f2fs_compressed_file(inode)) + f2fs_down_write(&F2FS_I(inode)->i_sem); + + if (!f2fs_compressed_file(inode)) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return true; - if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + } + if (f2fs_is_mmap_file(inode) || + (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return false; + } fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); + + f2fs_up_write(&F2FS_I(inode)->i_sem); return true; } From 16c326c7a519c6148766cc78c8a251bd7b62345d Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 18 Jan 2024 13:48:31 +0800 Subject: [PATCH 319/707] f2fs: compress: remove some redundant codes in f2fs_cache_compressed_page Just remove some redundant codes, no logic change. Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 0fd839358c1576..3dc488ce882be6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1889,12 +1889,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, set_page_private_data(cpage, ino); - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) - goto out; - memcpy(page_address(cpage), page_address(page), PAGE_SIZE); SetPageUptodate(cpage); -out: f2fs_put_page(cpage, 1); } From 5e9f083a7ae81058ceffd91e00c6fea3e7078dce Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Wed, 17 Jan 2024 15:59:58 +0800 Subject: [PATCH 320/707] f2fs: use IS_INODE replace IS_DNODE in f2fs_flush_inline_data Now IS_DNODE is used in f2fs_flush_inline_data and it has some problems: 1. Just only inodes may include inline data,not all direct nodes 2. When system IO is busy, it is inefficient to lock a direct node page but not an inode page. Besides, if this direct node page is being locked by others for IO, f2fs_flush_inline_data will be blocked here, which will affects the checkpoint process, this is unreasonable. So IS_INODE should be used in f2fs_flush_inline_data. Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9b546fd2101004..1d898a16f05a1d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1919,7 +1919,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct page *page = &fbatch.folios[i]->page; - if (!IS_DNODE(page)) + if (!IS_INODE(page)) continue; lock_page(page); From f31438c16879f0612fae83f02b11367c906a7d00 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Jan 2024 22:49:15 +0800 Subject: [PATCH 321/707] f2fs: fix to avoid potential panic during recovery During recovery, if FAULT_BLOCK is on, it is possible that f2fs_reserve_new_block() will return -ENOSPC during recovery, then it may trigger panic. Also, if fault injection rate is 1 and only FAULT_BLOCK fault type is on, it may encounter deadloop in loop of block reservation. Let's change as below to fix these issues: - remove bug_on() to avoid panic. - limit the loop count of block reservation to avoid potential deadloop. Fixes: 956fa1ddc132 ("f2fs: fix to check return value of f2fs_reserve_new_block()") Reported-by: Zhiguo Niu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/recovery.c | 33 ++++++++++++++++----------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 40d428636532ab..543898482f8b0c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -76,6 +76,11 @@ struct f2fs_fault_info { extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) + +/* maximum retry count for injected failure */ +#define DEFAULT_FAILURE_RETRY_COUNT 8 +#else +#define DEFAULT_FAILURE_RETRY_COUNT 1 #endif /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d0f24ccbd1ac6e..aad1d1a9b3d647 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -611,6 +611,19 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; } +static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn) +{ + int i, err = 0; + + for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) { + err = f2fs_reserve_new_block(dn); + if (!err) + break; + } + + return err; +} + static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page) { @@ -712,14 +725,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, */ if (dest == NEW_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); - do { - err = f2fs_reserve_new_block(&dn); - if (err == -ENOSPC) { - f2fs_bug_on(sbi, 1); - break; - } - } while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + + err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; continue; @@ -727,16 +734,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* dest is valid block, try to recover from src to dest */ if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { - if (src == NULL_ADDR) { - do { - err = f2fs_reserve_new_block(&dn); - if (err == -ENOSPC) { - f2fs_bug_on(sbi, 1); - break; - } - } while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; } From 55bc47ddbfbadc16e286891f2b3b6c92c2eeeeff Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 29 Jan 2024 13:18:56 +0000 Subject: [PATCH 322/707] cxl/pci: Skip to handle RAS errors if CXL.mem device is detached The PCI AER model is an awkward fit for CXL error handling. While the expectation is that a PCI device can escalate to link reset to recover from an AER event, the same reset on CXL amounts to a suprise memory hotplug of massive amounts of memory. At present, the CXL error handler attempts some optimisitic error handling to unbind the device from the cxl_mem driver after reaping some RAS register values. This results in a "hopeful" attempt to unplug the memory, but there is no guarantee that will succeed. A subsequent AER notification after the memdev unbind event can no longer assume the registers are mapped. Check for memdev bind before reaping status register values to avoid crashes of the form: BUG: unable to handle page fault for address: ffa00000195e9100 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page [...] RIP: 0010:__cxl_handle_ras+0x30/0x110 [cxl_core] [...] Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x82/0x160 ? kernelmode_fixup_or_oops+0x84/0x110 ? exc_page_fault+0x113/0x170 ? asm_exc_page_fault+0x26/0x30 ? __pfx_dpc_reset_link+0x10/0x10 ? __cxl_handle_ras+0x30/0x110 [cxl_core] ? find_cxl_port+0x59/0x80 [cxl_core] cxl_handle_rp_ras+0xbc/0xd0 [cxl_core] cxl_error_detected+0x6c/0xf0 [cxl_core] report_error_detected+0xc7/0x1c0 pci_walk_bus+0x73/0x90 pcie_do_recovery+0x23f/0x330 Longer term, the unbind and PCI_ERS_RESULT_DISCONNECT behavior might need to be replaced with a new PCI_ERS_RESULT_PANIC. Fixes: 6ac07883dbb5 ("cxl/pci: Add RCH downstream port error logging") Cc: stable@vger.kernel.org Suggested-by: Dan Williams Signed-off-by: Li Ming Link: https://lore.kernel.org/r/20240129131856.2458980-1-ming4.li@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/pci.c | 43 ++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 6c9c8d92f8f714..480489f5644e18 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -932,11 +932,21 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } void cxl_cor_error_detected(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); - cxl_handle_endpoint_cor_ras(cxlds); + cxl_handle_endpoint_cor_ras(cxlds); + } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); @@ -948,16 +958,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, struct device *dev = &cxlmd->dev; bool ue; - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_endpoint_ras(cxlds); + } - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_endpoint_ras(cxlds); switch (state) { case pci_channel_io_normal: From eeab239d6a2418fc5d2cd7ea76187085a97acde0 Mon Sep 17 00:00:00 2001 From: Fullway Wang Date: Thu, 18 Jan 2024 15:52:49 +0800 Subject: [PATCH 323/707] ASoC: wcd934x: fix an incorrect use of kstrndup() In wcd934x_codec_enable_dec(), kstrndup() is used to alloc memory. However, kmemdup_nul() should be used instead with the size known. This is similar to CVE-2019-12454 which was fixed in commit a549881. Signed-off-by: Fullway Wang Link: https://msgid.link/r/PH7PR20MB59255EF9DFFB022CB1BBB574BF712@PH7PR20MB5925.namprd20.prod.outlook.com Signed-off-by: Mark Brown --- sound/soc/codecs/wcd934x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 6813268e6a19f3..eaed17760ddda9 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -4989,7 +4989,7 @@ static int wcd934x_codec_enable_dec(struct snd_soc_dapm_widget *w, char *dec; u8 hpf_coff_freq; - widget_name = kstrndup(w->name, 15, GFP_KERNEL); + widget_name = kmemdup_nul(w->name, 15, GFP_KERNEL); if (!widget_name) return -ENOMEM; From 94a571344df867011b60e7c1399dea29410d7bd4 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 12 Jan 2024 12:09:50 -0800 Subject: [PATCH 324/707] x86/numa: Fix the address overlap check in numa_fill_memblks() numa_fill_memblks() fills in the gaps in numa_meminfo memblks over a physical address range. To do so, it first creates a list of existing memblks that overlap that address range. The issue is that it is off by one when comparing to the end of the address range, so memblks that do not overlap are selected. The impact of selecting a memblk that does not actually overlap is that an existing memblk may be filled when the expected action is to do nothing and return NUMA_NO_MEMBLK to the caller. The caller can then add a new NUMA node and memblk. Replace the broken open-coded search for address overlap with the memblock helper memblock_addrs_overlap(). Update the kernel doc and in code comments. Suggested by: "Huang, Ying" Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()") Signed-off-by: Alison Schofield Acked-by: Mike Rapoport (IBM) Acked-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/10a3e6109c34c21a8dd4c513cf63df63481a2b07.1705085543.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- arch/x86/mm/numa.c | 19 +++++++------------ include/linux/memblock.h | 2 ++ mm/memblock.c | 5 +++-- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index adc497b93f0374..8ada9bbfad583f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; * @start: address to begin fill * @end: address to end fill * - * Find and extend numa_meminfo memblks to cover the @start-@end - * physical address range, such that the first memblk includes - * @start, the last memblk includes @end, and any gaps in between - * are filled. + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end * * RETURNS: * 0 : Success - * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end */ int __init numa_fill_memblks(u64 start, u64 end) @@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end) /* * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. Exclude (start == bi->end) since - * end addresses in both a CFMWS range and a memblk range - * are exclusive. - * - * This list of pointers is used to make in-place changes - * that fill out the numa_meminfo memblks. + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. */ for (int i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - if (start < bi->end && end >= bi->start) { + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { blk[count] = &mi->blk[i]; count++; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b695f9e946dabb..e2082240586d00 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -121,6 +121,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif void memblock_trim_memory(phys_addr_t align); +unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); bool memblock_validate_numa_coverage(unsigned long threshold_bytes); diff --git a/mm/memblock.c b/mm/memblock.c index 4dcb2ee35eca85..964eb72db539e1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) /* * Address comparison utilities */ -static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) +unsigned long __init_memblock +memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, + phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } From 6be99530c92c6b8ff7a01903edc42393575ad63b Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 12 Jan 2024 12:09:51 -0800 Subject: [PATCH 325/707] x86/numa: Fix the sort compare func used in numa_fill_memblks() The compare function used to sort memblks into starting address order fails when the result of its u64 address subtraction gets truncated to an int upon return. The impact of the bad sort is that memblks will be filled out incorrectly. Depending on the set of memblks, a user may see no errors at all but still have a bad fill, or see messages reporting a node overlap that leads to numa init failure: [] node 0 [mem: ] overlaps with node 1 [mem: ] [] No NUMA configuration found Replace with a comparison that can only result in: 1, 0, -1. Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()") Signed-off-by: Alison Schofield Acked-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/99dcb3ae87e04995e9f293f6158dc8fa0749a487.1705085543.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8ada9bbfad583f..65e9a6e391c046 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b) const struct numa_memblk *ma = *(const struct numa_memblk **)a; const struct numa_memblk *mb = *(const struct numa_memblk **)b; - return ma->start - mb->start; + return (ma->start > mb->start) - (ma->start < mb->start); } static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; From 17525952fa834a75751f51726eb3cd683948b148 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 29 Jan 2024 13:58:13 +0000 Subject: [PATCH 326/707] cifs: make sure that channel scaling is done only once Following a successful cifs_tree_connect, we have the code to scale up/down the number of channels in the session. However, it is not protected by a lock today. As a result, this code can be executed by several processes that select the same channel. The core functions handle this well, as they pick chan_lock. However, we've seen cases where smb2_reconnect throws some warnings. To fix that, this change introduces a flags bitmap inside the cifs_ses structure. A new flag type is used to ensure that only one process enters this section at any time. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 3 +++ fs/smb/client/smb2pdu.c | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 16befff4cbb47c..9093c507042fa1 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1032,6 +1032,8 @@ struct cifs_chan { __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; +#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) + /* * Session structure. One of these for each uid session with a particular host */ @@ -1064,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ bool domainAuto:1; + unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 86f6f35b7f32e8..273e24f9da1316 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -399,6 +399,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, goto out; } + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) + goto skip_add_channels; + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { mutex_unlock(&ses->session_mutex); @@ -428,17 +434,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if (ses->chan_max > ses->chan_count && ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) + if (ses->chan_count == 1) { cifs_server_dbg(VFS, "supports multichannel now\n"); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } cifs_try_adding_channels(ses); - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); } } else { mutex_unlock(&ses->session_mutex); } + skip_add_channels: + spin_lock(&ses->ses_lock); + ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); From 4f6bb3af2346530c53a2ef727f2b682558c2ee8f Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 29 Jan 2024 21:04:44 -0300 Subject: [PATCH 327/707] smb: client: increase number of PDUs allowed in a compound request With the introduction of SMB2_OP_QUERY_WSL_EA, the client may now send 5 commands in a single compound request in order to query xattrs from potential WSL reparse points, which should be fine as we currently allow up to 5 PDUs in a single compound request. However, if encryption is enabled (e.g. 'seal' mount option) or enforced by the server, current MAX_COMPOUND(5) won't be enough as we require an extra PDU for the transform header. Fix this by increasing MAX_COMPOUND to 7 and, while we're at it, add an WARN_ON_ONCE() and return -EIO instead of -ENOMEM in case we attempt to send a compound request that couldn't include the extra transform header. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 +- fs/smb/client/transport.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 9093c507042fa1..c86a72c9d9ecd4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 5 +#define MAX_COMPOUND 7 /* * Default number of credits to keep available for SMB3. diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index e00278fcfa4fa6..994d7019343297 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -435,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); - if (num_rqst > MAX_COMPOUND - 1) - return -ENOMEM; + if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1)) + return -EIO; if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); From c7e65cd6e359372b857463041cfa3767c82c55f8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 21 Jan 2024 13:28:21 -0300 Subject: [PATCH 328/707] smb: client: introduce reparse mount option Allow the user to create special files and symlinks by choosing between WSL and NFS reparse points via 'reparse={nfs,wsl}' mount options. If unset or 'reparse=default', the client will default to creating them via NFS reparse points. Creating WSL reparse points isn't supported yet, so simply return error when attempting to mount with 'reparse=wsl' for now. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 6 ++++++ fs/smb/client/connect.c | 2 ++ fs/smb/client/fs_context.c | 35 +++++++++++++++++++++++++++++++++++ fs/smb/client/fs_context.h | 9 +++++++++ 4 files changed, 52 insertions(+) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index c86a72c9d9ecd4..b633f08f40f548 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -153,6 +153,12 @@ enum securityEnum { Kerberos, /* Kerberos via SPNEGO */ }; +enum cifs_reparse_type { + CIFS_REPARSE_TYPE_NFS, + CIFS_REPARSE_TYPE_WSL, + CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS, +}; + struct session_key { unsigned int len; char *response; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index bfd568f8971056..c5cf88de32b732 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2797,6 +2797,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 0; if (old->ctx->closetimeo != new->ctx->closetimeo) return 0; + if (old->ctx->reparse_type != new->ctx->reparse_type) + return 0; return 1; } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 52cbef2eeb28f6..aee8be2cae4676 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -174,6 +174,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("vers", Opt_vers), fsparam_string("sec", Opt_sec), fsparam_string("cache", Opt_cache), + fsparam_string("reparse", Opt_reparse), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), @@ -296,6 +297,35 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte return 0; } +static const match_table_t reparse_flavor_tokens = { + { Opt_reparse_default, "default" }, + { Opt_reparse_nfs, "nfs" }, + { Opt_reparse_wsl, "wsl" }, + { Opt_reparse_err, NULL }, +}; + +static int parse_reparse_flavor(struct fs_context *fc, char *value, + struct smb3_fs_context *ctx) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, reparse_flavor_tokens, args)) { + case Opt_reparse_default: + ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; + break; + case Opt_reparse_nfs: + ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; + break; + case Opt_reparse_wsl: + cifs_errorf(fc, "unsupported reparse= option: %s\n", value); + return 1; + default: + cifs_errorf(fc, "bad reparse= option: %s\n", value); + return 1; + } + return 0; +} + #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ @@ -1538,6 +1568,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_rdma: ctx->rdma = true; break; + case Opt_reparse: + if (parse_reparse_flavor(fc, param->string, ctx)) + goto cifs_parse_mount_err; + break; } /* case Opt_ignore: - is ignored as expected ... */ @@ -1624,6 +1658,7 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->backupgid_specified = false; /* no backup intent for a group */ ctx->retrans = 1; + ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; /* * short int override_uid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 182ce11cbe9362..1f09754977e7cc 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -41,6 +41,13 @@ enum { Opt_cache_err }; +enum cifs_reparse_parm { + Opt_reparse_default, + Opt_reparse_nfs, + Opt_reparse_wsl, + Opt_reparse_err +}; + enum cifs_sec_param { Opt_sec_krb5, Opt_sec_krb5i, @@ -148,6 +155,7 @@ enum cifs_param { Opt_vers, Opt_sec, Opt_cache, + Opt_reparse, /* Mount options to be ignored */ Opt_ignore, @@ -271,6 +279,7 @@ struct smb3_fs_context { char *leaf_fullpath; struct cifs_ses *dfs_root_ses; bool dfs_automount:1; /* set for dfs automount only */ + enum cifs_reparse_type reparse_type; }; extern const struct fs_parameter_spec smb3_fs_parameters[]; From 50e429b8523a8b84f2563fa87dde52285bfacbc7 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 21 Jan 2024 19:00:44 -0300 Subject: [PATCH 329/707] smb: client: move most of reparse point handling code to common file In preparation to add support for creating special files also via WSL reparse points in next commits. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/Makefile | 2 +- fs/smb/client/cifsglob.h | 13 -- fs/smb/client/cifsproto.h | 4 - fs/smb/client/inode.c | 79 +--------- fs/smb/client/readdir.c | 18 +-- fs/smb/client/reparse.c | 316 ++++++++++++++++++++++++++++++++++++++ fs/smb/client/reparse.h | 73 +++++++++ fs/smb/client/smb2ops.c | 250 +----------------------------- fs/smb/client/smb2proto.h | 6 + 9 files changed, 401 insertions(+), 360 deletions(-) create mode 100644 fs/smb/client/reparse.c create mode 100644 fs/smb/client/reparse.h diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile index 0b07eb94c93b38..e11985f2460b26 100644 --- a/fs/smb/client/Makefile +++ b/fs/smb/client/Makefile @@ -12,7 +12,7 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \ - namespace.o + namespace.o reparse.o $(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b633f08f40f548..a4aa95439e18c6 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -223,19 +223,6 @@ struct cifs_open_info_data { }; }; -static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) -{ - struct smb2_file_all_info *fi = &data->fi; - u32 attrs = le32_to_cpu(fi->Attributes); - bool ret; - - ret = data->reparse_point || (attrs & ATTR_REPARSE); - if (ret) - attrs |= ATTR_REPARSE; - fi->Attributes = cpu_to_le32(attrs); - return ret; -} - /* ***************************************************************** * Except the CIFS PDUs themselves all the diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index a841bf4967fa4d..770db902685013 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -208,10 +208,6 @@ extern struct inode *cifs_iget(struct super_block *sb, int cifs_get_inode_info(struct inode **inode, const char *full_path, struct cifs_open_info_data *data, struct super_block *sb, int xid, const struct cifs_fid *fid); -bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, - struct cifs_fattr *fattr, - struct cifs_open_info_data *data); - extern int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, struct cifs_open_info_data *data, diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index d02f8ba29cb5bf..56d77ff8249d50 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -26,6 +26,7 @@ #include "fs_context.h" #include "cifs_ioctl.h" #include "cached_dir.h" +#include "reparse.h" static void cifs_set_ops(struct inode *inode) { @@ -727,84 +728,6 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } -static inline dev_t nfs_mkdev(struct reparse_posix_data *buf) -{ - u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer); - - return MKDEV(v >> 32, v & 0xffffffff); -} - -bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, - struct cifs_fattr *fattr, - struct cifs_open_info_data *data) -{ - struct reparse_posix_data *buf = data->reparse.posix; - u32 tag = data->reparse.tag; - - if (tag == IO_REPARSE_TAG_NFS && buf) { - switch (le64_to_cpu(buf->InodeType)) { - case NFS_SPECFILE_CHR: - fattr->cf_mode |= S_IFCHR; - fattr->cf_dtype = DT_CHR; - fattr->cf_rdev = nfs_mkdev(buf); - break; - case NFS_SPECFILE_BLK: - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; - fattr->cf_rdev = nfs_mkdev(buf); - break; - case NFS_SPECFILE_FIFO: - fattr->cf_mode |= S_IFIFO; - fattr->cf_dtype = DT_FIFO; - break; - case NFS_SPECFILE_SOCK: - fattr->cf_mode |= S_IFSOCK; - fattr->cf_dtype = DT_SOCK; - break; - case NFS_SPECFILE_LNK: - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - break; - default: - WARN_ON_ONCE(1); - return false; - } - return true; - } - - switch (tag) { - case IO_REPARSE_TAG_LX_SYMLINK: - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - break; - case IO_REPARSE_TAG_LX_FIFO: - fattr->cf_mode |= S_IFIFO; - fattr->cf_dtype = DT_FIFO; - break; - case IO_REPARSE_TAG_AF_UNIX: - fattr->cf_mode |= S_IFSOCK; - fattr->cf_dtype = DT_SOCK; - break; - case IO_REPARSE_TAG_LX_CHR: - fattr->cf_mode |= S_IFCHR; - fattr->cf_dtype = DT_CHR; - break; - case IO_REPARSE_TAG_LX_BLK: - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; - break; - case 0: /* SMB1 symlink */ - case IO_REPARSE_TAG_SYMLINK: - case IO_REPARSE_TAG_NFS: - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - break; - default: - return false; - } - return true; -} - static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, struct super_block *sb) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 3b1b01d10f7d7a..7ef5a4b37901db 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -22,6 +22,7 @@ #include "smb2proto.h" #include "fs_context.h" #include "cached_dir.h" +#include "reparse.h" /* * To be safe - for UCS to UTF-8 with strings loaded with the rare long @@ -55,23 +56,6 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) } #endif /* DEBUG2 */ -/* - * Match a reparse point inode if reparse tag and ctime haven't changed. - * - * Windows Server updates ctime of reparse points when their data have changed. - * The server doesn't allow changing reparse tags from existing reparse points, - * though it's worth checking. - */ -static inline bool reparse_inode_match(struct inode *inode, - struct cifs_fattr *fattr) -{ - struct timespec64 ctime = inode_get_ctime(inode); - - return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) && - CIFS_I(inode)->reparse_tag == fattr->cf_cifstag && - timespec64_equal(&ctime, &fattr->cf_ctime); -} - /* * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT * diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c new file mode 100644 index 00000000000000..5ce9d1ed5b8881 --- /dev/null +++ b/fs/smb/client/reparse.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024 Paulo Alcantara + */ + +#include +#include +#include +#include "cifsglob.h" +#include "smb2proto.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "reparse.h" + +int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname) +{ + struct reparse_symlink_data_buffer *buf = NULL; + struct cifs_open_info_data data; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *new; + struct kvec iov; + __le16 *path; + char *sym; + u16 len, plen; + int rc = 0; + + sym = kstrdup(symname, GFP_KERNEL); + if (!sym) + return -ENOMEM; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, + .symlink_target = sym, + }; + + path = cifs_convert_path_to_utf16(symname, cifs_sb); + if (!path) { + rc = -ENOMEM; + goto out; + } + + plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX); + len = sizeof(*buf) + plen * 2; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); + buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); + buf->SubstituteNameOffset = cpu_to_le16(plen); + buf->SubstituteNameLength = cpu_to_le16(plen); + memcpy(&buf->PathBuffer[plen], path, plen); + buf->PrintNameOffset = 0; + buf->PrintNameLength = cpu_to_le16(plen); + memcpy(buf->PathBuffer, path, plen); + buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + + iov.iov_base = buf; + iov.iov_len = len; + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); +out: + kfree(path); + cifs_free_open_info(&data); + kfree(buf); + return rc; +} + +static int nfs_set_reparse_buf(struct reparse_posix_data *buf, + mode_t mode, dev_t dev, + struct kvec *iov) +{ + u64 type; + u16 len, dlen; + + len = sizeof(*buf); + + switch ((type = reparse_mode_nfs_type(mode))) { + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_CHR: + dlen = sizeof(__le64); + break; + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + dlen = 0; + break; + default: + return -EOPNOTSUPP; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS); + buf->Reserved = 0; + buf->InodeType = cpu_to_le64(type); + buf->ReparseDataLength = cpu_to_le16(len + dlen - + sizeof(struct reparse_data_buffer)); + *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) | + MINOR(dev)); + iov->iov_base = buf; + iov->iov_len = len + dlen; + return 0; +} + +int smb2_make_nfs_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_open_info_data data; + struct reparse_posix_data *p; + struct inode *new; + struct kvec iov; + __u8 buf[sizeof(*p) + sizeof(__le64)]; + int rc; + + p = (struct reparse_posix_data *)buf; + rc = nfs_set_reparse_buf(p, mode, dev, &iov); + if (rc) + return rc; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, + }; + + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + +/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ +static int parse_reparse_posix(struct reparse_posix_data *buf, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) +{ + unsigned int len; + u64 type; + + switch ((type = le64_to_cpu(buf->InodeType))) { + case NFS_SPECFILE_LNK: + len = le16_to_cpu(buf->ReparseDataLength); + data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer, + len, true, + cifs_sb->local_nls); + if (!data->symlink_target) + return -ENOMEM; + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", + __func__, data->symlink_target); + break; + case NFS_SPECFILE_CHR: + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + break; + default: + cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n", + __func__, type); + return -EOPNOTSUPP; + } + return 0; +} + +static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, + u32 plen, bool unicode, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) +{ + unsigned int len; + unsigned int offs; + + /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ + + offs = le16_to_cpu(sym->SubstituteNameOffset); + len = le16_to_cpu(sym->SubstituteNameLength); + if (offs + 20 > plen || offs + len + 20 > plen) { + cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); + return -EIO; + } + + data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs, + len, unicode, + cifs_sb->local_nls); + if (!data->symlink_target) + return -ENOMEM; + + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target); + + return 0; +} + +int parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, struct cifs_sb_info *cifs_sb, + bool unicode, struct cifs_open_info_data *data) +{ + data->reparse.buf = buf; + + /* See MS-FSCC 2.1.2 */ + switch (le32_to_cpu(buf->ReparseTag)) { + case IO_REPARSE_TAG_NFS: + return parse_reparse_posix((struct reparse_posix_data *)buf, + cifs_sb, data); + case IO_REPARSE_TAG_SYMLINK: + return parse_reparse_symlink( + (struct reparse_symlink_data_buffer *)buf, + plen, unicode, cifs_sb, data); + case IO_REPARSE_TAG_LX_SYMLINK: + case IO_REPARSE_TAG_AF_UNIX: + case IO_REPARSE_TAG_LX_FIFO: + case IO_REPARSE_TAG_LX_CHR: + case IO_REPARSE_TAG_LX_BLK: + return 0; + default: + cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", + __func__, le32_to_cpu(buf->ReparseTag)); + return -EOPNOTSUPP; + } +} + +int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data) +{ + struct reparse_data_buffer *buf; + struct smb2_ioctl_rsp *io = rsp_iov->iov_base; + u32 plen = le32_to_cpu(io->OutputCount); + + buf = (struct reparse_data_buffer *)((u8 *)io + + le32_to_cpu(io->OutputOffset)); + return parse_reparse_point(buf, plen, cifs_sb, true, data); +} + +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data) +{ + struct reparse_posix_data *buf = data->reparse.posix; + u32 tag = data->reparse.tag; + + if (tag == IO_REPARSE_TAG_NFS && buf) { + switch (le64_to_cpu(buf->InodeType)) { + case NFS_SPECFILE_CHR: + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = reparse_nfs_mkdev(buf); + break; + case NFS_SPECFILE_BLK: + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = reparse_nfs_mkdev(buf); + break; + case NFS_SPECFILE_FIFO: + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + break; + case NFS_SPECFILE_SOCK: + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; + break; + case NFS_SPECFILE_LNK: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + default: + WARN_ON_ONCE(1); + return false; + } + return true; + } + + switch (tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + case IO_REPARSE_TAG_LX_FIFO: + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + break; + case IO_REPARSE_TAG_AF_UNIX: + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; + break; + case IO_REPARSE_TAG_LX_CHR: + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + break; + case IO_REPARSE_TAG_LX_BLK: + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + break; + case 0: /* SMB1 symlink */ + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + default: + return false; + } + return true; +} diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h new file mode 100644 index 00000000000000..3ceb90da0df901 --- /dev/null +++ b/fs/smb/client/reparse.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024 Paulo Alcantara + */ + +#ifndef _CIFS_REPARSE_H +#define _CIFS_REPARSE_H + +#include +#include +#include "cifsglob.h" + +static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf) +{ + u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer); + + return MKDEV(v >> 32, v & 0xffffffff); +} + +static inline u64 reparse_mode_nfs_type(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFBLK: return NFS_SPECFILE_BLK; + case S_IFCHR: return NFS_SPECFILE_CHR; + case S_IFIFO: return NFS_SPECFILE_FIFO; + case S_IFSOCK: return NFS_SPECFILE_SOCK; + } + return 0; +} + +/* + * Match a reparse point inode if reparse tag and ctime haven't changed. + * + * Windows Server updates ctime of reparse points when their data have changed. + * The server doesn't allow changing reparse tags from existing reparse points, + * though it's worth checking. + */ +static inline bool reparse_inode_match(struct inode *inode, + struct cifs_fattr *fattr) +{ + struct timespec64 ctime = inode_get_ctime(inode); + + return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) && + CIFS_I(inode)->reparse_tag == fattr->cf_cifstag && + timespec64_equal(&ctime, &fattr->cf_ctime); +} + +static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) +{ + struct smb2_file_all_info *fi = &data->fi; + u32 attrs = le32_to_cpu(fi->Attributes); + bool ret; + + ret = data->reparse_point || (attrs & ATTR_REPARSE); + if (ret) + attrs |= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + return ret; +} + +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data); +int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname); +int smb2_make_nfs_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev); +int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov, + struct cifs_open_info_data *data); + +#endif /* _CIFS_REPARSE_H */ diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 83c898afc8354b..aee0cbf01e6cdc 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -28,6 +28,7 @@ #include "fscache.h" #include "fs_context.h" #include "cached_dir.h" +#include "reparse.h" /* Change credits for different ops and return the total number of credits */ static int @@ -2982,109 +2983,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, return rc; } -/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ -static int parse_reparse_posix(struct reparse_posix_data *buf, - struct cifs_sb_info *cifs_sb, - struct cifs_open_info_data *data) -{ - unsigned int len; - u64 type; - - switch ((type = le64_to_cpu(buf->InodeType))) { - case NFS_SPECFILE_LNK: - len = le16_to_cpu(buf->ReparseDataLength); - data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer, - len, true, - cifs_sb->local_nls); - if (!data->symlink_target) - return -ENOMEM; - convert_delimiter(data->symlink_target, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", - __func__, data->symlink_target); - break; - case NFS_SPECFILE_CHR: - case NFS_SPECFILE_BLK: - case NFS_SPECFILE_FIFO: - case NFS_SPECFILE_SOCK: - break; - default: - cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n", - __func__, type); - return -EOPNOTSUPP; - } - return 0; -} - -static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, - u32 plen, bool unicode, - struct cifs_sb_info *cifs_sb, - struct cifs_open_info_data *data) -{ - unsigned int len; - unsigned int offs; - - /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ - - offs = le16_to_cpu(sym->SubstituteNameOffset); - len = le16_to_cpu(sym->SubstituteNameLength); - if (offs + 20 > plen || offs + len + 20 > plen) { - cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); - return -EIO; - } - - data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs, - len, unicode, - cifs_sb->local_nls); - if (!data->symlink_target) - return -ENOMEM; - - convert_delimiter(data->symlink_target, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target); - - return 0; -} - -int parse_reparse_point(struct reparse_data_buffer *buf, - u32 plen, struct cifs_sb_info *cifs_sb, - bool unicode, struct cifs_open_info_data *data) -{ - data->reparse.buf = buf; - - /* See MS-FSCC 2.1.2 */ - switch (le32_to_cpu(buf->ReparseTag)) { - case IO_REPARSE_TAG_NFS: - return parse_reparse_posix((struct reparse_posix_data *)buf, - cifs_sb, data); - case IO_REPARSE_TAG_SYMLINK: - return parse_reparse_symlink( - (struct reparse_symlink_data_buffer *)buf, - plen, unicode, cifs_sb, data); - case IO_REPARSE_TAG_LX_SYMLINK: - case IO_REPARSE_TAG_AF_UNIX: - case IO_REPARSE_TAG_LX_FIFO: - case IO_REPARSE_TAG_LX_CHR: - case IO_REPARSE_TAG_LX_BLK: - return 0; - default: - cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", - __func__, le32_to_cpu(buf->ReparseTag)); - return -EOPNOTSUPP; - } -} - -static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, - struct kvec *rsp_iov, - struct cifs_open_info_data *data) -{ - struct reparse_data_buffer *buf; - struct smb2_ioctl_rsp *io = rsp_iov->iov_base; - u32 plen = le32_to_cpu(io->OutputCount); - - buf = (struct reparse_data_buffer *)((u8 *)io + - le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, true, data); -} - static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 info) @@ -5124,148 +5022,6 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode, return rc; } -static inline u64 mode_nfs_type(mode_t mode) -{ - switch (mode & S_IFMT) { - case S_IFBLK: return NFS_SPECFILE_BLK; - case S_IFCHR: return NFS_SPECFILE_CHR; - case S_IFIFO: return NFS_SPECFILE_FIFO; - case S_IFSOCK: return NFS_SPECFILE_SOCK; - } - return 0; -} - -static int nfs_set_reparse_buf(struct reparse_posix_data *buf, - mode_t mode, dev_t dev, - struct kvec *iov) -{ - u64 type; - u16 len, dlen; - - len = sizeof(*buf); - - switch ((type = mode_nfs_type(mode))) { - case NFS_SPECFILE_BLK: - case NFS_SPECFILE_CHR: - dlen = sizeof(__le64); - break; - case NFS_SPECFILE_FIFO: - case NFS_SPECFILE_SOCK: - dlen = 0; - break; - default: - return -EOPNOTSUPP; - } - - buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS); - buf->Reserved = 0; - buf->InodeType = cpu_to_le64(type); - buf->ReparseDataLength = cpu_to_le16(len + dlen - - sizeof(struct reparse_data_buffer)); - *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) | - MINOR(dev)); - iov->iov_base = buf; - iov->iov_len = len + dlen; - return 0; -} - -static int nfs_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) -{ - struct cifs_open_info_data data; - struct reparse_posix_data *p; - struct inode *new; - struct kvec iov; - __u8 buf[sizeof(*p) + sizeof(__le64)]; - int rc; - - p = (struct reparse_posix_data *)buf; - rc = nfs_set_reparse_buf(p, mode, dev, &iov); - if (rc) - return rc; - - data = (struct cifs_open_info_data) { - .reparse_point = true, - .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, - }; - - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov); - if (!IS_ERR(new)) - d_instantiate(dentry, new); - else - rc = PTR_ERR(new); - cifs_free_open_info(&data); - return rc; -} - -static int smb2_create_reparse_symlink(const unsigned int xid, - struct inode *inode, - struct dentry *dentry, - struct cifs_tcon *tcon, - const char *full_path, - const char *symname) -{ - struct reparse_symlink_data_buffer *buf = NULL; - struct cifs_open_info_data data; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct inode *new; - struct kvec iov; - __le16 *path; - char *sym; - u16 len, plen; - int rc = 0; - - sym = kstrdup(symname, GFP_KERNEL); - if (!sym) - return -ENOMEM; - - data = (struct cifs_open_info_data) { - .reparse_point = true, - .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, - .symlink_target = sym, - }; - - path = cifs_convert_path_to_utf16(symname, cifs_sb); - if (!path) { - rc = -ENOMEM; - goto out; - } - - plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX); - len = sizeof(*buf) + plen * 2; - buf = kzalloc(len, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - - buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); - buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); - buf->SubstituteNameOffset = cpu_to_le16(plen); - buf->SubstituteNameLength = cpu_to_le16(plen); - memcpy(&buf->PathBuffer[plen], path, plen); - buf->PrintNameOffset = 0; - buf->PrintNameLength = cpu_to_le16(plen); - memcpy(buf->PathBuffer, path, plen); - buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); - - iov.iov_base = buf; - iov.iov_len = len; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov); - if (!IS_ERR(new)) - d_instantiate(dentry, new); - else - rc = PTR_ERR(new); -out: - kfree(path); - cifs_free_open_info(&data); - kfree(buf); - return rc; -} - static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) @@ -5283,8 +5039,8 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); } else { - rc = nfs_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); + rc = smb2_make_nfs_node(xid, inode, dentry, tcon, + full_path, mode, dev); } return rc; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index b3069911e9dd8f..5322b769bca21c 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -308,5 +308,11 @@ int smb311_posix_query_path_info(const unsigned int xid, int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); +int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, const char *symname); +int smb2_make_nfs_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev); #endif /* _SMB2PROTO_H */ From e0954d76ef62711d17b51c25c73ed8f1d3def2f5 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 25 Jan 2024 17:04:05 -0300 Subject: [PATCH 330/707] smb: client: fix potential broken compound request Now that smb2_compound_op() can accept up to 5 commands in a single compound request, set the appropriate NextCommand and related flags to all subsequent commands as well as handling the case where a valid @cfile is passed and therefore skipping create and close requests in the compound chain. This fix a potential broken compound request that could be sent from smb2_get_reparse_inode() if the client found a valid open file (@cfile) prior to calling smb2_compound_op(). Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 106 ++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 43 deletions(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 05818cd6d932e9..11de1318d8e0f5 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -202,14 +202,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); - } } - - if (rc) + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; + } num_rqst++; trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); @@ -239,14 +238,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + (sizeof(struct cifs_sid) * 2), 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); - } } - - if (rc) + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; + } num_rqst++; trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); @@ -304,13 +302,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); - } } - if (rc) + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; + } num_rqst++; trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; @@ -335,14 +333,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); - } } - - if (rc) + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; + } num_rqst++; trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); @@ -376,13 +373,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); - } } - if (rc) + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; + } num_rqst++; trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; @@ -417,15 +414,27 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = vars->io_iov; rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); - rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], - COMPOUND_FID, COMPOUND_FID, - FSCTL_SET_REPARSE_POINT, - in_iov[i].iov_base, - in_iov[i].iov_len, 0); - if (rc) + if (cfile) { + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_REPARSE_POINT, + in_iov[i].iov_base, + in_iov[i].iov_len, 0); + } else { + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_SET_REPARSE_POINT, + in_iov[i].iov_base, + in_iov[i].iov_len, 0); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + } + num_rqst++; trace_smb3_set_reparse_compound_enter(xid, ses->Suid, tcon->tid, full_path); break; @@ -433,14 +442,25 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = vars->io_iov; rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); - rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], - COMPOUND_FID, COMPOUND_FID, - FSCTL_GET_REPARSE_POINT, - NULL, 0, CIFSMaxBufSize); - if (rc) + if (cfile) { + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_GET_REPARSE_POINT, + NULL, 0, CIFSMaxBufSize); + } else { + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_GET_REPARSE_POINT, + NULL, 0, CIFSMaxBufSize); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + } + num_rqst++; trace_smb3_get_reparse_compound_enter(xid, ses->Suid, tcon->tid, full_path); break; From 88b95f351a721d9c585eb7ece6bc4a38c5fdcdfd Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 25 Jan 2024 19:21:48 -0300 Subject: [PATCH 331/707] smb: client: reduce number of parameters in smb2_compound_op() Replace @desired_access, @create_disposition, @create_options and @mode parameters with a single @oparms. No functional changes. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 11 ++++ fs/smb/client/smb2inode.c | 125 +++++++++++++++++++++----------------- 2 files changed, 81 insertions(+), 55 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index a4aa95439e18c6..23db1686f9104f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2269,6 +2269,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, } } +#define CIFS_OPARMS(_cifs_sb, _tcon, _path, _da, _cd, _co, _mode) \ + ((struct cifs_open_parms) { \ + .tcon = _tcon, \ + .path = _path, \ + .desired_access = (_da), \ + .disposition = (_cd), \ + .create_options = cifs_create_options(_cifs_sb, (_co)), \ + .mode = (_mode), \ + .cifs_sb = _cifs_sb, \ + }) + struct smb2_compound_vars { struct cifs_open_parms oparms; struct kvec rsp_iov[MAX_COMPOUND]; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 11de1318d8e0f5..93ca39b4be4d29 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -95,8 +95,7 @@ static int parse_posix_sids(struct cifs_open_info_data *data, */ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, umode_t mode, struct kvec *in_iov, + struct cifs_open_parms *oparms, struct kvec *in_iov, int *cmds, int num_cmds, struct cifsFileInfo *cfile, struct kvec *out_iov, int *out_buftype) { @@ -152,16 +151,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } - vars->oparms = (struct cifs_open_parms) { - .tcon = tcon, - .path = full_path, - .desired_access = desired_access, - .disposition = create_disposition, - .create_options = cifs_create_options(cifs_sb, create_options), - .fid = &fid, - .mode = mode, - .cifs_sb = cifs_sb, - }; + vars->oparms = *oparms; + vars->oparms.fid = &fid; rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; @@ -713,6 +704,7 @@ int smb2_query_path_info(const unsigned int xid, const char *full_path, struct cifs_open_info_data *data) { + struct cifs_open_parms oparms; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; @@ -764,10 +756,11 @@ int smb2_query_path_info(const unsigned int xid, in_iov[1] = in_iov[0]; cifs_get_readable_path(tcon, full_path, &cfile); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, - cmds, 1, cfile, out_iov, out_buftype); + &oparms, in_iov, cmds, 1, cfile, + out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -794,12 +787,14 @@ int smb2_query_path_info(const unsigned int xid, cmds[1] = SMB2_OP_GET_REPARSE; num_cmds = 2; } - create_options |= OPEN_REPARSE_POINT; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options | OPEN_REPARSE_POINT, + ACL_NO_MODE); cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, - cmds, num_cmds, cfile, NULL, NULL); + &oparms, in_iov, cmds, num_cmds, + cfile, NULL, NULL); break; case -EREMOTE: break; @@ -827,10 +822,13 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_compound_op(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, - NULL, &(int){SMB2_OP_MKDIR}, 1, + struct cifs_open_parms oparms; + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES, + FILE_CREATE, CREATE_NOT_FILE, mode); + return smb2_compound_op(xid, tcon, cifs_sb, + name, &oparms, NULL, + &(int){SMB2_OP_MKDIR}, 1, NULL, NULL, NULL); } @@ -839,6 +837,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, const unsigned int xid) { + struct cifs_open_parms oparms; FILE_BASIC_INFO data = {}; struct cifsInodeInfo *cifs_i; struct cifsFileInfo *cfile; @@ -852,9 +851,10 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile); + oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES, + FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, + &oparms, &in_iov, &(int){SMB2_OP_SET_INFO}, 1, cfile, NULL, NULL); if (tmprc == 0) @@ -865,10 +865,13 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { + struct cifs_open_parms oparms; + drop_cached_dir_by_name(xid, tcon, name, cifs_sb); - return smb2_compound_op(xid, tcon, cifs_sb, name, - DELETE, FILE_OPEN, CREATE_NOT_FILE, - ACL_NO_MODE, NULL, + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE, + FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); + return smb2_compound_op(xid, tcon, cifs_sb, + name, &oparms, NULL, &(int){SMB2_OP_RMDIR}, 1, NULL, NULL, NULL); } @@ -877,10 +880,14 @@ int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, - &(int){SMB2_OP_DELETE}, 1, + struct cifs_open_parms oparms; + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, + DELETE, FILE_OPEN, + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + ACL_NO_MODE); + return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, + NULL, &(int){SMB2_OP_DELETE}, 1, NULL, NULL, NULL); } @@ -890,6 +897,7 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, __u32 create_options, __u32 access, int command, struct cifsFileInfo *cfile) { + struct cifs_open_parms oparms; struct kvec in_iov; __le16 *smb2_to_name = NULL; int rc; @@ -901,8 +909,9 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } in_iov.iov_base = smb2_to_name; in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); - rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, create_options, ACL_NO_MODE, + oparms = CIFS_OPARMS(cifs_sb, tcon, from_name, access, FILE_OPEN, + create_options, ACL_NO_MODE); + rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, &oparms, &in_iov, &command, 1, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); @@ -943,6 +952,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc) { + struct cifs_open_parms oparms; struct cifsFileInfo *cfile; struct kvec in_iov; __le64 eof = cpu_to_le64(size); @@ -950,9 +960,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, in_iov.iov_base = &eof; in_iov.iov_len = sizeof(eof); cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); - return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, - 0, ACL_NO_MODE, &in_iov, + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_DATA, + FILE_OPEN, 0, ACL_NO_MODE); + return smb2_compound_op(xid, tcon, cifs_sb, + full_path, &oparms, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, cfile, NULL, NULL); } @@ -961,6 +972,7 @@ int smb2_set_file_info(struct inode *inode, const char *full_path, FILE_BASIC_INFO *buf, const unsigned int xid) { + struct cifs_open_parms oparms; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -979,9 +991,10 @@ smb2_set_file_info(struct inode *inode, const char *full_path, tcon = tlink_tcon(tlink); cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, &in_iov, + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_ATTRIBUTES, + FILE_OPEN, 0, ACL_NO_MODE); + rc = smb2_compound_op(xid, tcon, cifs_sb, + full_path, &oparms, &in_iov, &(int){SMB2_OP_SET_INFO}, 1, cfile, NULL, NULL); cifs_put_tlink(tlink); @@ -995,19 +1008,21 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, const char *full_path, struct kvec *iov) { + struct cifs_open_parms oparms; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsFileInfo *cfile; struct inode *new = NULL; struct kvec in_iov[2]; int cmds[2]; - int da, co, cd; int rc; - da = SYNCHRONIZE | DELETE | - FILE_READ_ATTRIBUTES | - FILE_WRITE_ATTRIBUTES; - co = CREATE_NOT_DIR | OPEN_REPARSE_POINT; - cd = FILE_CREATE; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + SYNCHRONIZE | DELETE | + FILE_READ_ATTRIBUTES | + FILE_WRITE_ATTRIBUTES, + FILE_CREATE, + CREATE_NOT_DIR | OPEN_REPARSE_POINT, + ACL_NO_MODE); cmds[0] = SMB2_OP_SET_REPARSE; in_iov[0] = *iov; in_iov[1].iov_base = data; @@ -1016,9 +1031,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, if (tcon->posix_extensions) { cmds[1] = SMB2_OP_POSIX_QUERY_INFO; cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, + in_iov, cmds, 2, cfile, NULL, NULL); if (!rc) { rc = smb311_posix_get_inode_info(&new, full_path, data, sb, xid); @@ -1026,9 +1040,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, } else { cmds[1] = SMB2_OP_QUERY_INFO; cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, + in_iov, cmds, 2, cfile, NULL, NULL); if (!rc) { rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); @@ -1044,6 +1057,7 @@ int smb2_query_reparse_point(const unsigned int xid, u32 *tag, struct kvec *rsp, int *rsp_buftype) { + struct cifs_open_parms oparms; struct cifs_open_info_data data = {}; struct cifsFileInfo *cfile; struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), }; @@ -1052,9 +1066,10 @@ int smb2_query_reparse_point(const unsigned int xid, cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE); + rc = smb2_compound_op(xid, tcon, cifs_sb, + full_path, &oparms, &in_iov, &(int){SMB2_OP_GET_REPARSE}, 1, cfile, NULL, NULL); if (rc) From 3815959fe70011dea1589df4f90623ade113c5ad Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 26 Jan 2024 19:26:06 -0300 Subject: [PATCH 332/707] smb: client: add support for WSL reparse points Add support for creating special files via WSL reparse points when using 'reparse=wsl' mount option. They're faster than NFS reparse points because they don't require extra roundtrips to figure out what ->d_type a specific dirent is as such information is already stored in query dir responses and then making getdents() calls faster. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/fs_context.c | 4 +- fs/smb/client/reparse.c | 170 +++++++++++++++++++++++++++++++++++-- fs/smb/client/reparse.h | 13 ++- fs/smb/client/smb2inode.c | 8 +- fs/smb/client/smb2ops.c | 2 +- fs/smb/client/smb2pdu.c | 12 +++ fs/smb/client/smb2pdu.h | 11 ++- fs/smb/client/smb2proto.h | 3 +- fs/smb/common/smbfsctl.h | 6 -- 10 files changed, 210 insertions(+), 20 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 23db1686f9104f..90bc90ed97ef66 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1371,6 +1371,7 @@ struct cifs_open_parms { struct cifs_fid *fid; umode_t mode; bool reconnect:1; + struct kvec *ea_cctx; }; struct cifs_fid { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index aee8be2cae4676..82eafe0815dc52 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -317,8 +317,8 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value, ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; break; case Opt_reparse_wsl: - cifs_errorf(fc, "unsupported reparse= option: %s\n", value); - return 1; + ctx->reparse_type = CIFS_REPARSE_TYPE_WSL; + break; default: cifs_errorf(fc, "bad reparse= option: %s\n", value); return 1; diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 5ce9d1ed5b8881..23e0cb62552ec7 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -11,6 +11,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fs_context.h" #include "reparse.h" int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, @@ -64,7 +65,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov); + tcon, full_path, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); else @@ -110,9 +111,9 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf, return 0; } -int smb2_make_nfs_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +static int mknod_nfs(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { struct cifs_open_info_data data; struct reparse_posix_data *p; @@ -132,12 +133,171 @@ int smb2_make_nfs_node(unsigned int xid, struct inode *inode, }; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov); + tcon, full_path, &iov, NULL); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + +static int wsl_set_reparse_buf(struct reparse_data_buffer *buf, + mode_t mode, struct kvec *iov) +{ + u32 tag; + + switch ((tag = reparse_mode_wsl_tag(mode))) { + case IO_REPARSE_TAG_LX_BLK: + case IO_REPARSE_TAG_LX_CHR: + case IO_REPARSE_TAG_LX_FIFO: + case IO_REPARSE_TAG_AF_UNIX: + break; + default: + return -EOPNOTSUPP; + } + + buf->ReparseTag = cpu_to_le32(tag); + buf->Reserved = 0; + buf->ReparseDataLength = 0; + iov->iov_base = buf; + iov->iov_len = sizeof(*buf); + return 0; +} + +static struct smb2_create_ea_ctx *ea_create_context(u32 dlen, size_t *cc_len) +{ + struct smb2_create_ea_ctx *cc; + + *cc_len = round_up(sizeof(*cc) + dlen, 8); + cc = kzalloc(*cc_len, GFP_KERNEL); + if (!cc) + return ERR_PTR(-ENOMEM); + + cc->ctx.NameOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx, + name)); + cc->ctx.NameLength = cpu_to_le16(4); + memcpy(cc->name, SMB2_CREATE_EA_BUFFER, strlen(SMB2_CREATE_EA_BUFFER)); + cc->ctx.DataOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx, ea)); + cc->ctx.DataLength = cpu_to_le32(dlen); + return cc; +} + +struct wsl_xattr { + const char *name; + __le64 value; + u16 size; + u32 next; +}; + +static int wsl_set_xattrs(struct inode *inode, umode_t _mode, + dev_t _dev, struct kvec *iov) +{ + struct smb2_file_full_ea_info *ea; + struct smb2_create_ea_ctx *cc; + struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx; + __le64 uid = cpu_to_le64(from_kuid(current_user_ns(), ctx->linux_uid)); + __le64 gid = cpu_to_le64(from_kgid(current_user_ns(), ctx->linux_gid)); + __le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev)); + __le64 mode = cpu_to_le64(_mode); + struct wsl_xattr xattrs[] = { + { .name = "$LXUID", .value = uid, .size = 4, }, + { .name = "$LXGID", .value = gid, .size = 4, }, + { .name = "$LXMOD", .value = mode, .size = 4, }, + { .name = "$LXDEV", .value = dev, .size = 8, }, + }; + size_t cc_len; + u32 dlen = 0, next = 0; + int i, num_xattrs; + u8 name_size = strlen(xattrs[0].name) + 1; + + memset(iov, 0, sizeof(*iov)); + + /* Exclude $LXDEV xattr for sockets and fifos */ + if (S_ISSOCK(_mode) || S_ISFIFO(_mode)) + num_xattrs = ARRAY_SIZE(xattrs) - 1; + else + num_xattrs = ARRAY_SIZE(xattrs); + + for (i = 0; i < num_xattrs; i++) { + xattrs[i].next = ALIGN(sizeof(*ea) + name_size + + xattrs[i].size, 4); + dlen += xattrs[i].next; + } + + cc = ea_create_context(dlen, &cc_len); + if (!cc) + return PTR_ERR(cc); + + ea = &cc->ea; + for (i = 0; i < num_xattrs; i++) { + ea = (void *)((u8 *)ea + next); + next = xattrs[i].next; + ea->next_entry_offset = cpu_to_le32(next); + + ea->ea_name_length = name_size - 1; + ea->ea_value_length = cpu_to_le16(xattrs[i].size); + memcpy(ea->ea_data, xattrs[i].name, name_size); + memcpy(&ea->ea_data[name_size], + &xattrs[i].value, xattrs[i].size); + } + ea->next_entry_offset = 0; + + iov->iov_base = cc; + iov->iov_len = cc_len; + return 0; +} + +static int mknod_wsl(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_open_info_data data; + struct reparse_data_buffer buf; + struct inode *new; + struct kvec reparse_iov, xattr_iov; + int rc; + + rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov); + if (rc) + return rc; + + rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov); + if (rc) + return rc; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, }, + }; + + new = smb2_get_reparse_inode(&data, inode->i_sb, + xid, tcon, full_path, + &reparse_iov, &xattr_iov); if (!IS_ERR(new)) d_instantiate(dentry, new); else rc = PTR_ERR(new); cifs_free_open_info(&data); + kfree(xattr_iov.iov_base); + return rc; +} + +int smb2_mknod_reparse(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx; + int rc = -EOPNOTSUPP; + + switch (ctx->reparse_type) { + case CIFS_REPARSE_TYPE_NFS: + rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev); + break; + case CIFS_REPARSE_TYPE_WSL: + rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev); + break; + } return rc; } diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 3ceb90da0df901..9816bac9855257 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -28,6 +28,17 @@ static inline u64 reparse_mode_nfs_type(mode_t mode) return 0; } +static inline u32 reparse_mode_wsl_tag(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFBLK: return IO_REPARSE_TAG_LX_BLK; + case S_IFCHR: return IO_REPARSE_TAG_LX_CHR; + case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO; + case S_IFSOCK: return IO_REPARSE_TAG_AF_UNIX; + } + return 0; +} + /* * Match a reparse point inode if reparse tag and ctime haven't changed. * @@ -64,7 +75,7 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname); -int smb2_make_nfs_node(unsigned int xid, struct inode *inode, +int smb2_mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 93ca39b4be4d29..e9955b964ea470 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1006,7 +1006,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct kvec *iov) + struct kvec *reparse_iov, + struct kvec *xattr_iov) { struct cifs_open_parms oparms; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1023,8 +1024,11 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, FILE_CREATE, CREATE_NOT_DIR | OPEN_REPARSE_POINT, ACL_NO_MODE); + if (xattr_iov) + oparms.ea_cctx = xattr_iov; + cmds[0] = SMB2_OP_SET_REPARSE; - in_iov[0] = *iov; + in_iov[0] = *reparse_iov; in_iov[1].iov_base = data; in_iov[1].iov_len = sizeof(*data); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index aee0cbf01e6cdc..708973ec750b8f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5039,7 +5039,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); } else { - rc = smb2_make_nfs_node(xid, inode, dentry, tcon, + rc = smb2_mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } return rc; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 273e24f9da1316..2837fc4465a7ff 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2707,6 +2707,17 @@ add_query_id_context(struct kvec *iov, unsigned int *num_iovec) return 0; } +static void add_ea_context(struct cifs_open_parms *oparms, + struct kvec *rq_iov, unsigned int *num_iovs) +{ + struct kvec *iov = oparms->ea_cctx; + + if (iov && iov->iov_base && iov->iov_len) { + rq_iov[(*num_iovs)++] = *iov; + memset(iov, 0, sizeof(*iov)); + } +} + static int alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, const char *treename, const __le16 *path) @@ -3073,6 +3084,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } add_query_id_context(iov, &n_iov); + add_ea_context(oparms, iov, &n_iov); if (n_iov > 2) { /* diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index db08194484e06c..ea63d33e455322 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -117,9 +117,10 @@ struct share_redirect_error_context_rsp { * [4] : posix context * [5] : time warp context * [6] : query id context - * [7] : compound padding + * [7] : create ea context + * [8] : compound padding */ -#define SMB2_CREATE_IOV_SIZE 8 +#define SMB2_CREATE_IOV_SIZE 9 /* * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + @@ -413,4 +414,10 @@ struct smb2_posix_info_parsed { const u8 *name; }; +struct smb2_create_ea_ctx { + struct create_context ctx; + __u8 name[8]; + struct smb2_file_full_ea_info ea; +} __packed; + #endif /* _SMB2PDU_H */ diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 5322b769bca21c..d910d38671ee86 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -61,7 +61,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct kvec *iov); + struct kvec *reparse_iov, + struct kvec *xattr_iov); int smb2_query_reparse_point(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h index edd7fc2a7921b8..a94d658b88e86b 100644 --- a/fs/smb/common/smbfsctl.h +++ b/fs/smb/common/smbfsctl.h @@ -158,12 +158,6 @@ #define IO_REPARSE_TAG_LX_CHR 0x80000025 #define IO_REPARSE_TAG_LX_BLK 0x80000026 -#define IO_REPARSE_TAG_LX_SYMLINK_LE cpu_to_le32(0xA000001D) -#define IO_REPARSE_TAG_AF_UNIX_LE cpu_to_le32(0x80000023) -#define IO_REPARSE_TAG_LX_FIFO_LE cpu_to_le32(0x80000024) -#define IO_REPARSE_TAG_LX_CHR_LE cpu_to_le32(0x80000025) -#define IO_REPARSE_TAG_LX_BLK_LE cpu_to_le32(0x80000026) - /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 From 27931312fd50069dcad71e9660a84417b36224b3 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 10 Apr 2023 21:04:50 +0900 Subject: [PATCH 333/707] sysv: don't call sb_bread() with pointers_lock held syzbot is reporting sleep in atomic context in SysV filesystem [1], for sb_bread() is called with rw_spinlock held. A "write_lock(&pointers_lock) => read_lock(&pointers_lock) deadlock" bug and a "sb_bread() with write_lock(&pointers_lock)" bug were introduced by "Replace BKL for chain locking with sysvfs-private rwlock" in Linux 2.5.12. Then, "[PATCH] err1-40: sysvfs locking fix" in Linux 2.6.8 fixed the former bug by moving pointers_lock lock to the callers, but instead introduced a "sb_bread() with read_lock(&pointers_lock)" bug (which made this problem easier to hit). Al Viro suggested that why not to do like get_branch()/get_block()/ find_shared() in Minix filesystem does. And doing like that is almost a revert of "[PATCH] err1-40: sysvfs locking fix" except that get_branch() from with find_shared() is called without write_lock(&pointers_lock). Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f Suggested-by: Al Viro Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/0d195f93-a22a-49a2-0020-103534d6f7f6@I-love.SAKURA.ne.jp Signed-off-by: Christian Brauner --- fs/sysv/itree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 410ab2a44d2f60..19bcb51a220366 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh) return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); } -/* - * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) - */ static Indirect *get_branch(struct inode *inode, int depth, int offsets[], @@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode, bh = sb_bread(sb, block); if (!bh) goto failure; + read_lock(&pointers_lock); if (!verify_chain(chain, p)) goto changed; add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + read_unlock(&pointers_lock); if (!p->key) goto no_block; } return NULL; changed: + read_unlock(&pointers_lock); brelse(bh); *err = -EAGAIN; goto no_block; @@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b goto out; reread: - read_lock(&pointers_lock); partial = get_branch(inode, depth, offsets, chain, &err); - read_unlock(&pointers_lock); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode, *top = 0; for (k = depth; k > 1 && !offsets[k-1]; k--) ; + partial = get_branch(inode, k, offsets, chain, &err); write_lock(&pointers_lock); - partial = get_branch(inode, k, offsets, chain, &err); if (!partial) partial = chain + k-1; /* From 994fe04908b762031eaadd6bcaf343e85f3cf163 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 29 Jan 2024 19:00:23 +0100 Subject: [PATCH 334/707] ntfs3: use file_mnt_idmap helper Let's use file_mnt_idmap() as we do that across the tree. No functional impact. Cc: Christian Brauner Cc: Konstantin Komarov Cc: Cc: Cc: Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240129180024.219766-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- fs/ntfs3/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be51701e..144aa80cca433e 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -419,7 +419,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. */ - inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); From 563bd99dc1910e62b4d99eb1906bc8858acf483b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:37:29 -0800 Subject: [PATCH 335/707] iov_iter: Avoid wrap-around instrumentation in copy_compat_iovec_from_user() The loop counter "i" in copy_compat_iovec_from_user() is an int, but because the nr_segs argument is unsigned long, the signed overflow sanitizer got worried "i" could wrap around. Instead of making "i" an unsigned long (which may enlarge the type size), switch both nr_segs and i to u32. There is no truncation with nr_segs since it is never larger than UIO_MAXIOV anyway. This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129183729.work.991-kees@kernel.org Signed-off-by: Christian Brauner --- lib/iov_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 15f5040709c36e..73715d10c812bf 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1167,11 +1167,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; - int ret = -EFAULT, i; + int ret = -EFAULT; + u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; From ee393cff29bd46cbf469abbfb795857ffc96b98b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:40:15 -0800 Subject: [PATCH 336/707] select: Avoid wrap-around instrumentation in do_sys_poll() The mix of int, unsigned int, and unsigned long used by struct poll_list::len, todo, len, and j meant that the signed overflow sanitizer got worried it needed to instrument several places where arithmetic happens between these variables. Since all of the variables are always positive and bounded by unsigned int, use a single type in all places. Additionally expand the zero-test into an explicit range check before updating "todo". This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: do_sys_poll+0x285: call to __ubsan_handle_sub_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Cc: Jan Kara Cc: Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129184014.work.593-kees@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c29..11a3b1312abeff 100644 --- a/fs/select.c +++ b/fs/select.c @@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; - int len; + unsigned int len; struct pollfd entries[]; }; @@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len; + int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; - unsigned long todo = nfds; + unsigned int todo = nfds; + unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; @@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, sizeof(struct pollfd) * walk->len)) goto out_fds; - todo -= walk->len; - if (!todo) + if (walk->len >= todo) break; + todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), @@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; - int j; + unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); From 21a3b988053a05e0c397ce407fa5fb01550c6e0d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Jan 2024 16:01:57 +0100 Subject: [PATCH 337/707] mmc: core: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_range()/ida_alloc_max() is inclusive. So a -1 has been added when needed. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/583c57d0ae09f9d3a1e1a7b80c1e39ada17954b7.1705244502.git.christophe.jaillet@wanadoo.fr Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 12 ++++++------ drivers/mmc/core/host.c | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 32d49100dff516..a9b60b91e32f6d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -206,7 +206,7 @@ static void mmc_blk_kref_release(struct kref *ref) int devidx; devidx = mmc_get_devidx(md->disk); - ida_simple_remove(&mmc_blk_ida, devidx); + ida_free(&mmc_blk_ida, devidx); mutex_lock(&open_lock); md->disk->private_data = NULL; @@ -2467,7 +2467,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, bool cache_enabled = false; bool fua_enabled = false; - devidx = ida_simple_get(&mmc_blk_ida, 0, max_devices, GFP_KERNEL); + devidx = ida_alloc_max(&mmc_blk_ida, max_devices - 1, GFP_KERNEL); if (devidx < 0) { /* * We get -ENOSPC because there are no more any available @@ -2577,7 +2577,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, err_kfree: kfree(md); out: - ida_simple_remove(&mmc_blk_ida, devidx); + ida_free(&mmc_blk_ida, devidx); return ERR_PTR(ret); } @@ -2703,7 +2703,7 @@ static void mmc_blk_rpmb_device_release(struct device *dev) { struct mmc_rpmb_data *rpmb = dev_get_drvdata(dev); - ida_simple_remove(&mmc_rpmb_ida, rpmb->id); + ida_free(&mmc_rpmb_ida, rpmb->id); kfree(rpmb); } @@ -2719,13 +2719,13 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, struct mmc_rpmb_data *rpmb; /* This creates the minor number for the RPMB char device */ - devidx = ida_simple_get(&mmc_rpmb_ida, 0, max_devices, GFP_KERNEL); + devidx = ida_alloc_max(&mmc_rpmb_ida, max_devices - 1, GFP_KERNEL); if (devidx < 0) return devidx; rpmb = kzalloc(sizeof(*rpmb), GFP_KERNEL); if (!rpmb) { - ida_simple_remove(&mmc_rpmb_ida, devidx); + ida_free(&mmc_rpmb_ida, devidx); return -ENOMEM; } diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index cf396e8f34e986..7cc9a33d28cacb 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -76,7 +76,7 @@ static void mmc_host_classdev_release(struct device *dev) struct mmc_host *host = cls_dev_to_mmc_host(dev); wakeup_source_unregister(host->ws); if (of_alias_get_id(host->parent->of_node, "mmc") < 0) - ida_simple_remove(&mmc_host_ida, host->index); + ida_free(&mmc_host_ida, host->index); kfree(host); } @@ -538,7 +538,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) min_idx = mmc_first_nonreserved_index(); max_idx = 0; - index = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL); + index = ida_alloc_range(&mmc_host_ida, min_idx, max_idx - 1, + GFP_KERNEL); if (index < 0) { kfree(host); return NULL; From 05c7b901a03f5ac745ebd8205b1400ce26606588 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 22 Jan 2024 17:16:23 +0800 Subject: [PATCH 338/707] dt-bindings: mmc: fsl-imx-esdhc: add i.MX95 compatible string Same as i.MX93, add i.MX95 SDHC which is compatible with i.MX8MM USDHC. Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20240122091623.2078089-1-peng.fan@oss.nxp.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml index 82eb7a24c85782..f3c5aa64affcb4 100644 --- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml +++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml @@ -55,8 +55,9 @@ properties: - enum: - fsl,imx8mn-usdhc - fsl,imx8mp-usdhc - - fsl,imx93-usdhc - fsl,imx8ulp-usdhc + - fsl,imx93-usdhc + - fsl,imx95-usdhc - const: fsl,imx8mm-usdhc - items: - enum: From 4e99ffb173faaf38f010acb369bff57a20e9e531 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 25 Jan 2024 09:50:23 +0100 Subject: [PATCH 339/707] mmc: core Drop BLK_BOUNCE_HIGH The MMC core sets BLK_BOUNCE_HIGH for devices where dma_mask is unassigned. For the majority of MMC hosts this path is never taken: the OF core will unconditionally assign a 32-bit mask to any OF device, and most MMC hosts are probed from device tree, see drivers/of/platform.c: of_platform_device_create_pdata() dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); if (!dev->dev.dma_mask) dev->dev.dma_mask = &dev->dev.coherent_dma_mask; of_amba_device_create() dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); dev->dev.dma_mask = &dev->dev.coherent_dma_mask; MMC devices that are probed from ACPI or PCI will likewise have a proper dma_mask assigned. The only remaining devices that could have a blank dma_mask are platform devices instantiated from board files. These are mostly used on systems without CONFIG_HIGHMEM enabled which means the block layer will not bounce, and in the few cases where it is enabled it is not used anyway: for example some OMAP2 systems such as Nokia n800/n810 will create a platform_device and not assign a dma_mask, however they do not have any highmem, so no bouncing will happen anyway: the block core checks if max_low_pfn >= max_pfn and this will always be false. Should it turn out there is a platform_device with blank DMA mask actually using CONFIG_HIGHMEM somewhere out there we should set dma_mask for it, not do this trickery. Signed-off-by: Linus Walleij Acked-by: Arnd Bergmann Acked-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240125-mmc-no-blk-bounce-high-v1-1-d0f92a30e085@linaro.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/queue.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index a0a2412f62a730..316415588a77ce 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -351,8 +351,6 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); - if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask) - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH); blk_queue_max_hw_sectors(mq->queue, min(host->max_blk_count, host->max_req_size / 512)); if (host->can_dma_map_merge) From f73c3a862847b49e0643bf554962d980526abfd6 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 23 Jan 2024 17:12:47 +0800 Subject: [PATCH 340/707] exfat: fix file not locking when writing zeros in exfat_file_mmap() inode->i_rwsem should be locked when writing file. But the lock is missing when writing zeros to the file in exfat_file_mmap(). Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength") Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index d25a96a148af4c..473c1641d50d50 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -613,7 +613,11 @@ static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) start + vma->vm_end - vma->vm_start); if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) { + if (!inode_trylock(inode)) + return -EAGAIN; + ret = exfat_file_zeroed_range(file, ei->valid_size, end); + inode_unlock(inode); if (ret < 0) { exfat_err(inode->i_sb, "mmap: fail to zero from %llu to %llu(%d)", From 8b29fa18400ccb7fb681f105d74b2cabb59e5d62 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 30 Jan 2024 12:46:21 +0800 Subject: [PATCH 341/707] exfat: ratelimit error msg in exfat_file_mmap() Ratelimit the error message of zeroing out data between the valid size and the file size in exfat_file_mmap() to not flood dmesg. Signed-off-by: Yuezhang Mo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 5 +++++ fs/exfat/file.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 9474cd50da6d4f..46f2760d984644 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -542,6 +542,11 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) /* expand to pr_*() with prefix */ #define exfat_err(sb, fmt, ...) \ pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) +#define exfat_err_ratelimit(sb, fmt, args...) \ + do { \ + if (__ratelimit(&EXFAT_SB(sb)->ratelimit)) \ + exfat_err(sb, fmt, ## args); \ + } while (0) #define exfat_warn(sb, fmt, ...) \ pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_info(sb, fmt, ...) \ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 473c1641d50d50..68405ae06772d4 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -619,7 +619,7 @@ static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) ret = exfat_file_zeroed_range(file, ei->valid_size, end); inode_unlock(inode); if (ret < 0) { - exfat_err(inode->i_sb, + exfat_err_ratelimit(inode->i_sb, "mmap: fail to zero from %llu to %llu(%d)", start, end, ret); return ret; From 90a7463fae9eb9f68a6e4ff3e8868beb8fbfc649 Mon Sep 17 00:00:00 2001 From: Duy Nguyen Date: Thu, 25 Jan 2024 16:34:37 +0100 Subject: [PATCH 342/707] pmdomain: renesas: r8a779h0-sysc: Add r8a779h0 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for R-Car V4M (R8A779H0) SoC power areas to the R-Car SYSC driver. Signed-off-by: Duy Nguyen Signed-off-by: Geert Uytterhoeven Acked-by: Niklas Söderlund Link: https://lore.kernel.org/r/eed6faa02c628d32676ab8ea0eee636b4ffd6c47.1706194617.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/Kconfig | 4 ++ drivers/pmdomain/renesas/Makefile | 1 + drivers/pmdomain/renesas/r8a779h0-sysc.c | 54 +++++++++++++++++++++++ drivers/pmdomain/renesas/rcar-gen4-sysc.c | 3 ++ drivers/pmdomain/renesas/rcar-gen4-sysc.h | 1 + 5 files changed, 63 insertions(+) create mode 100644 drivers/pmdomain/renesas/r8a779h0-sysc.c diff --git a/drivers/pmdomain/renesas/Kconfig b/drivers/pmdomain/renesas/Kconfig index 80bf2cf8b60e6f..54acb4b1ec7c48 100644 --- a/drivers/pmdomain/renesas/Kconfig +++ b/drivers/pmdomain/renesas/Kconfig @@ -71,6 +71,10 @@ config SYSC_R8A779G0 bool "System Controller support for R-Car V4H" if COMPILE_TEST select SYSC_RCAR_GEN4 +config SYSC_R8A779H0 + bool "System Controller support for R-Car V4M" if COMPILE_TEST + select SYSC_RCAR_GEN4 + config SYSC_RMOBILE bool "System Controller support for R-Mobile" if COMPILE_TEST diff --git a/drivers/pmdomain/renesas/Makefile b/drivers/pmdomain/renesas/Makefile index e306e396fc8c10..89180f19c23be7 100644 --- a/drivers/pmdomain/renesas/Makefile +++ b/drivers/pmdomain/renesas/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SYSC_R8A77995) += r8a77995-sysc.o obj-$(CONFIG_SYSC_R8A779A0) += r8a779a0-sysc.o obj-$(CONFIG_SYSC_R8A779F0) += r8a779f0-sysc.o obj-$(CONFIG_SYSC_R8A779G0) += r8a779g0-sysc.o +obj-$(CONFIG_SYSC_R8A779H0) += r8a779h0-sysc.o # Family obj-$(CONFIG_SYSC_RCAR) += rcar-sysc.o obj-$(CONFIG_SYSC_RCAR_GEN4) += rcar-gen4-sysc.o diff --git a/drivers/pmdomain/renesas/r8a779h0-sysc.c b/drivers/pmdomain/renesas/r8a779h0-sysc.c new file mode 100644 index 00000000000000..e13372cb80eea8 --- /dev/null +++ b/drivers/pmdomain/renesas/r8a779h0-sysc.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas R-Car V4M System Controller + * + * Copyright (C) 2023 Renesas Electronics Corp + */ + +#include + +#include + +#include "rcar-gen4-sysc.h" + +static struct rcar_gen4_sysc_area r8a779h0_areas[] __initdata = { + { "always-on", R8A779H0_PD_ALWAYS_ON, -1, PD_ALWAYS_ON }, + { "c4", R8A779H0_PD_C4, R8A779H0_PD_ALWAYS_ON }, + { "a2e0d0", R8A779H0_PD_A2E0D0, R8A779H0_PD_C4, PD_SCU }, + { "a1e0d0c0", R8A779H0_PD_A1E0D0C0, R8A779H0_PD_A2E0D0, PD_CPU_NOCR }, + { "a1e0d0c1", R8A779H0_PD_A1E0D0C1, R8A779H0_PD_A2E0D0, PD_CPU_NOCR }, + { "a1e0d0c2", R8A779H0_PD_A1E0D0C2, R8A779H0_PD_A2E0D0, PD_CPU_NOCR }, + { "a1e0d0c3", R8A779H0_PD_A1E0D0C3, R8A779H0_PD_A2E0D0, PD_CPU_NOCR }, + { "a3cr0", R8A779H0_PD_A3CR0, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR }, + { "a3cr1", R8A779H0_PD_A3CR1, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR }, + { "a3cr2", R8A779H0_PD_A3CR2, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR }, + { "a33dga", R8A779H0_PD_A33DGA, R8A779H0_PD_C4 }, + { "a23dgb", R8A779H0_PD_A23DGB, R8A779H0_PD_A33DGA }, + { "a3vip0", R8A779H0_PD_A3VIP0, R8A779H0_PD_C4 }, + { "a3vip2", R8A779H0_PD_A3VIP2, R8A779H0_PD_C4 }, + { "a3dul", R8A779H0_PD_A3DUL, R8A779H0_PD_C4 }, + { "a3isp0", R8A779H0_PD_A3ISP0, R8A779H0_PD_C4 }, + { "a2cn0", R8A779H0_PD_A2CN0, R8A779H0_PD_C4 }, + { "a1cn0", R8A779H0_PD_A1CN0, R8A779H0_PD_A2CN0 }, + { "a1dsp0", R8A779H0_PD_A1DSP0, R8A779H0_PD_A2CN0 }, + { "a1dsp1", R8A779H0_PD_A1DSP1, R8A779H0_PD_A2CN0 }, + { "a2imp01", R8A779H0_PD_A2IMP01, R8A779H0_PD_C4 }, + { "a2psc", R8A779H0_PD_A2PSC, R8A779H0_PD_C4 }, + { "a2dma", R8A779H0_PD_A2DMA, R8A779H0_PD_C4 }, + { "a2cv0", R8A779H0_PD_A2CV0, R8A779H0_PD_C4 }, + { "a2cv1", R8A779H0_PD_A2CV1, R8A779H0_PD_C4 }, + { "a2cv2", R8A779H0_PD_A2CV2, R8A779H0_PD_C4 }, + { "a2cv3", R8A779H0_PD_A2CV3, R8A779H0_PD_C4 }, + { "a3imr0", R8A779H0_PD_A3IMR0, R8A779H0_PD_C4 }, + { "a3imr1", R8A779H0_PD_A3IMR1, R8A779H0_PD_C4 }, + { "a3imr2", R8A779H0_PD_A3IMR2, R8A779H0_PD_C4 }, + { "a3imr3", R8A779H0_PD_A3IMR3, R8A779H0_PD_C4 }, + { "a3vc", R8A779H0_PD_A3VC, R8A779H0_PD_C4 }, + { "a3pci", R8A779H0_PD_A3PCI, R8A779H0_PD_C4 }, + { "a2pciphy", R8A779H0_PD_A2PCIPHY, R8A779H0_PD_A3PCI }, +}; + +const struct rcar_gen4_sysc_info r8a779h0_sysc_info __initconst = { + .areas = r8a779h0_areas, + .num_areas = ARRAY_SIZE(r8a779h0_areas), +}; diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c index 9e5e6e077abc08..728248659a97e8 100644 --- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c +++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c @@ -284,6 +284,9 @@ static const struct of_device_id rcar_gen4_sysc_matches[] __initconst = { #endif #ifdef CONFIG_SYSC_R8A779G0 { .compatible = "renesas,r8a779g0-sysc", .data = &r8a779g0_sysc_info }, +#endif +#ifdef CONFIG_SYSC_R8A779H0 + { .compatible = "renesas,r8a779h0-sysc", .data = &r8a779h0_sysc_info }, #endif { /* sentinel */ } }; diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.h b/drivers/pmdomain/renesas/rcar-gen4-sysc.h index 388cfa8f8f9fd6..fdf843aa51134f 100644 --- a/drivers/pmdomain/renesas/rcar-gen4-sysc.h +++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.h @@ -40,5 +40,6 @@ struct rcar_gen4_sysc_info { extern const struct rcar_gen4_sysc_info r8a779a0_sysc_info; extern const struct rcar_gen4_sysc_info r8a779f0_sysc_info; extern const struct rcar_gen4_sysc_info r8a779g0_sysc_info; +extern const struct rcar_gen4_sysc_info r8a779h0_sysc_info; #endif /* __SOC_RENESAS_RCAR_GEN4_SYSC_H__ */ From 5a774e241fdd2e90fd3562a6a9aaa822c848ec71 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Tue, 30 Jan 2024 20:59:03 +0530 Subject: [PATCH 343/707] hwmon: (pmbus_core) Allow to hook PMBUS_SMBALERT_MASK Use _pmbus_write_word_data to allow intercepting writes to PMBUS_SMBALERT_MASK in the custom chip specific code. This is required for MP2971/MP2973 which doesn't follow the PMBUS specification for PMBUS_SMBALERT_MASK. Signed-off-by: Patrick Rudolph Signed-off-by: Naresh Solanki Link: https://lore.kernel.org/r/20240130152903.3651341-1-naresh.solanki@9elements.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/pmbus_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 1363d9f89181d2..cb4c65a7f288c1 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -3188,7 +3188,7 @@ static int pmbus_regulator_notify(struct pmbus_data *data, int page, int event) static int pmbus_write_smbalert_mask(struct i2c_client *client, u8 page, u8 reg, u8 val) { - return pmbus_write_word_data(client, page, PMBUS_SMBALERT_MASK, reg | (val << 8)); + return _pmbus_write_word_data(client, page, PMBUS_SMBALERT_MASK, reg | (val << 8)); } static irqreturn_t pmbus_fault_handler(int irq, void *pdata) From 567f7205dd7a0e168314b480e4bd80c77cbe71cb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:58 -0800 Subject: [PATCH 344/707] x86/opcode: Add ERET[US] instructions to the x86 opcode map ERETU returns from an event handler while making a transition to ring 3, and ERETS returns from an event handler while staying in ring 0. Add instruction opcodes used by ERET[US] to the x86 opcode map; opcode numbers are per FRED spec v5.0. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20231205105030.8698-10-xin3.li@intel.com --- arch/x86/lib/x86-opcode-map.txt | 2 +- tools/arch/x86/lib/x86-opcode-map.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1efe1d9bf5ce4b..12af572201a29f 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1052,7 +1052,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 1efe1d9bf5ce4b..12af572201a29f 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -1052,7 +1052,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv From 379ae086a73c804df39866d28eb4ce693e7af486 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:59 -0800 Subject: [PATCH 345/707] x86/objtool: Teach objtool about ERET[US] Update the objtool decoder to know about the ERET[US] instructions (type INSN_CONTEXT_SWITCH). Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-11-xin3.li@intel.com --- tools/objtool/arch/x86/decode.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index e327cd82713522..3a1d80a7878d33 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -509,11 +509,20 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec if (op2 == 0x01) { - if (modrm == 0xca) - insn->type = INSN_CLAC; - else if (modrm == 0xcb) - insn->type = INSN_STAC; - + switch (insn_last_prefix_id(&ins)) { + case INAT_PFX_REPE: + case INAT_PFX_REPNE: + if (modrm == 0xca) + /* eretu/erets */ + insn->type = INSN_CONTEXT_SWITCH; + break; + default: + if (modrm == 0xca) + insn->type = INSN_CLAC; + else if (modrm == 0xcb) + insn->type = INSN_STAC; + break; + } } else if (op2 >= 0x80 && op2 <= 0x8f) { insn->type = INSN_JUMP_CONDITIONAL; From 95d34efac1a0aff6da00ad177168d99c60a3b7cd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:00 -0800 Subject: [PATCH 346/707] x86/cpu: Add X86_CR4_FRED macro Add X86_CR4_FRED macro for the FRED bit in %cr4. This bit must not be changed after initialization, so add it to the pinned CR4 bits. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-12-xin3.li@intel.com --- arch/x86/include/uapi/asm/processor-flags.h | 7 +++++++ arch/x86/kernel/cpu/common.c | 5 ++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index d898432947ff35..f1a4adc782720b 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -139,6 +139,13 @@ #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) +#ifdef __x86_64__ +#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */ +#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT) +#else +#define X86_CR4_FRED (0) +#endif + /* * x86-64 Task Priority Register, CR8 */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0b97bcde70c610..c3a175770df032 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -382,9 +382,8 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) } /* These bits should not change their value after CPU init is finished. */ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; From 0b2e6c1c724fc7b72c86a72d49b16c7617d6f5f9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:01 -0800 Subject: [PATCH 347/707] x86/cpu: Add MSR numbers for FRED configuration Add MSR numbers for the FRED configuration registers per FRED spec 5.0. Originally-by: Megha Dey Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-13-xin3.li@intel.com --- arch/x86/include/asm/msr-index.h | 13 ++++++++++++- tools/arch/x86/include/asm/msr-index.h | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f1bd7b91b3c637..1f9dc9bd13eb7e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,8 +36,19 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 1d51e1850ed03d..74f2c63ce71728 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -36,8 +36,19 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) From ed262541af195f452c43cd4f28310a09065039ec Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:02 -0800 Subject: [PATCH 348/707] x86/ptrace: Cleanup the definition of the pt_regs structure struct pt_regs is hard to read because the member or section related comments are not aligned with the members. The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while in reality they are only 16-bit wide. This works so far as the remaining space is unused, but FRED will use the remaining bits for other purposes. To prepare for FRED: - Cleanup the formatting - Convert 'cs' and 'ss' to u16 and embed them into an union with a u64 - Fixup the related printk() format strings Suggested-by: Thomas Gleixner Originally-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com --- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/include/asm/ptrace.h | 48 +++++++++++++++++++-------- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea876..a3c0df11d0e6d8 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f0b..b268cd2a2d01c4 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -57,17 +57,19 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +79,38 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* The full 64-bit data slot containing CS */ + u64 csx; + /* CS selector */ + u16 cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* The full 64-bit data slot containing SS */ + u64 ssx; + /* SS selector */ + u16 ss; + }; + + /* + * Top of stack on IDT systems. + */ }; #endif /* !__i386__ */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 33b268747bb7bb..0f78b58021bb2a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); - printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); From c125443456e97f7bcc87cc7ba1346c2b92c4db94 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:03 -0800 Subject: [PATCH 349/707] x86/ptrace: Add FRED additional information to the pt_regs structure FRED defines additional information in the upper 48 bits of cs/ss fields. Therefore add the information definitions into the pt_regs structure. Specifically introduce a new structure fred_ss to denote the FRED flags above SS selector, which avoids FRED_SSX_ macros and makes the code simpler and easier to read. Suggested-by: Thomas Gleixner Originally-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-15-xin3.li@intel.com --- arch/x86/include/asm/ptrace.h | 66 ++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index b268cd2a2d01c4..5a83fbd9bc0b44 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -56,6 +56,50 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + * Nested exception during FRED delivery, not set + * for #DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { /* * C ABI says these regs are callee-preserved. They aren't saved on @@ -85,6 +129,12 @@ struct pt_regs { * - the syscall number (syscall, sysenter, int80) * - error_code stored by the CPU on traps and exceptions * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. */ unsigned long orig_ax; @@ -92,24 +142,30 @@ struct pt_regs { unsigned long ip; union { - /* The full 64-bit data slot containing CS */ - u64 csx; /* CS selector */ u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; }; unsigned long flags; unsigned long sp; union { - /* The full 64-bit data slot containing SS */ - u64 ssx; /* SS selector */ u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; }; /* - * Top of stack on IDT systems. + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. */ }; From c413db75cb7db9740330f3375d1854994bd0c8cb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:04 -0800 Subject: [PATCH 350/707] x86/fred: Add a new header file for FRED definitions Add a header file for FRED prototypes and definitions. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-16-xin3.li@intel.com --- arch/x86/include/asm/fred.h | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/fred.h diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 00000000000000..f514fdb5a39f73 --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include + +#include + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include + +#include + +struct fred_info { + /* Event data: CR2, DR6, ... */ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ From fcd06abf6de2b81724a1e39c121d288f66b1d392 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:05 -0800 Subject: [PATCH 351/707] x86/fred: Reserve space for the FRED stack frame When using FRED, reserve space at the top of the stack frame, just like i386 does. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-17-xin3.li@intel.com --- arch/x86/include/asm/thread_info.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d63b02940747fa..12da7dfd5ef13b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. */ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* From f393835cbab6184f3ee6ed90499a88e9930a8512 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:06 -0800 Subject: [PATCH 352/707] x86/fred: Update MSR_IA32_FRED_RSP0 during task switch MSR_IA32_FRED_RSP0 is used during ring 3 event delivery, and needs to be updated to point to the top of next task stack during task switch. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-18-xin3.li@intel.com --- arch/x86/include/asm/switch_to.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b0e..c3bd0c0758c9a4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* WRMSRNS is a baseline feature for FRED. */ + wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); + } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); + } #endif } From 5710910a6c94bcb08d1081ca94119220066331ad Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:07 -0800 Subject: [PATCH 353/707] x86/fred: Disallow the swapgs instruction when FRED is enabled SWAPGS is no longer needed thus NOT allowed with FRED because FRED transitions ensure that an operating system can _always_ operate with its own GS base address: - For events that occur in ring 3, FRED event delivery swaps the GS base address with the IA32_KERNEL_GS_BASE MSR. - ERETU (the FRED transition that returns to ring 3) also swaps the GS base address with the IA32_KERNEL_GS_BASE MSR. And the operating system can still setup the GS segment for a user thread without the need of loading a user thread GS with: - Using LKGS, available with FRED, to modify other attributes of the GS segment without compromising its ability always to operate with its own GS base address. - Accessing the GS segment base address for a user thread as before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE MSR instead of the GS segment's descriptor cache. As such, the operating system never changes its runtime GS base address. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-19-xin3.li@intel.com --- arch/x86/kernel/process_64.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0f78b58021bb2a..4f87f5987ae8c1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -166,7 +166,29 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* + * SWAPGS is no longer needed thus NOT allowed with FRED because + * FRED transitions ensure that an operating system can _always_ + * operate with its own GS base address: + * - For events that occur in ring 3, FRED event delivery swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * - ERETU (the FRED transition that returns to ring 3) also swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * + * And the operating system can still setup the GS segment for a + * user thread without the need of loading a user thread GS with: + * - Using LKGS, available with FRED, to modify other attributes + * of the GS segment without compromising its ability always to + * operate with its own GS base address. + * - Accessing the GS segment base address for a user thread as + * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. + * + * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE + * MSR instead of the GS segment’s descriptor cache. As such, the + * operating system never changes its runtime GS base address. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -191,7 +213,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); From d0fb796dc3475cf71d788ec960d8ed5de4d7a429 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:08 -0800 Subject: [PATCH 354/707] x86/fred: No ESPFIX needed when FRED is enabled Because FRED always restores the full value of %rsp, ESPFIX is no longer needed when it's enabled. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-20-xin3.li@intel.com --- arch/x86/kernel/espfix_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 16f9814c9be02c..6726e0473d0b40 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -106,6 +106,10 @@ void __init init_espfix_bsp(void) pgd_t *pgd; p4d_t *p4d; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* Install the espfix pud into the kernel page directory */ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); @@ -129,6 +133,10 @@ void init_espfix_ap(int cpu) void *stack_page; pteval_t ptemask; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* We only have to do this once... */ if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ From f102fe126d2811eded63d700fbe27527d936af74 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:09 -0800 Subject: [PATCH 355/707] x86/fred: Allow single-step trap and NMI when starting a new task Entering a new task is logically speaking a return from a system call (exec, fork, clone, etc.). As such, if ptrace enables single stepping a single step exception should be allowed to trigger immediately upon entering user space. This is not optional. NMI should *never* be disabled in user space. As such, this is an optional, opportunistic way to catch errors. Allow single-step trap and NMI when starting a new task, thus once the new task enters user space, single-step trap and NMI are both enabled immediately. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-21-xin3.li@intel.com --- arch/x86/kernel/process_64.c | 38 ++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4f87f5987ae8c1..c075591b7b46a6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,6 +56,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include @@ -528,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, - unsigned int _cs, unsigned int _ss, unsigned int _ds) + u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); @@ -545,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(ds, _ds); load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - regs->cs = _cs; - regs->ss = _ss; - regs->flags = X86_EFLAGS_IF; + regs->ip = new_ip; + regs->sp = new_sp; + regs->csx = _cs; + regs->ssx = _ss; + /* + * Allow single-step trap and NMI when starting a new task, thus + * once the new task enters user space, single-step trap and NMI + * are both enabled immediately. + * + * Entering a new task is logically speaking a return from a + * system call (exec, fork, clone, etc.). As such, if ptrace + * enables single stepping a single step exception should be + * allowed to trigger immediately upon entering user space. + * This is not optional. + * + * NMI should *never* be disabled in user space. As such, this + * is an optional, opportunistic way to catch errors. + * + * Paranoia: High-order 48 bits above the lowest 16 bit SS are + * discarded by the legacy IRET instruction on all Intel, AMD, + * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, + * even when FRED is not enabled. But we choose the safer side + * to use these bits only when FRED is enabled. + */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + regs->fred_ss.swevent = true; + regs->fred_ss.nmi = true; + } + + regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void From 9f6870bafc183644d20cba702168e37b48e291a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:10 -0800 Subject: [PATCH 356/707] x86/fred: Make exc_page_fault() work for FRED On a FRED system, the faulting address (CR2) is passed on the stack, to avoid the problem of transient state. Thus the page fault address is read from the FRED stack frame instead of CR2 when FRED is enabled. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-22-xin3.li@intel.com --- arch/x86/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241c7..fa2d69951f25c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -34,6 +34,7 @@ #include /* kvm_handle_async_pf */ #include /* fixup_vdso_exception() */ #include +#include #define CREATE_TRACE_POINTS #include @@ -1518,8 +1519,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); From 2ad2917c6f50c707fc9872f6885807e4133bd882 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:11 -0800 Subject: [PATCH 357/707] x86/idtentry: Incorporate definitions/declarations of the FRED entries FRED and IDT can share most of the definitions and declarations so that in the majority of cases the actual handler implementation is the same. The differences are the exceptions where FRED stores exception related information on the stack and the sysvec implementations as FRED can handle irqentry/exit() in the dispatcher instead of having it in each handler. Also add stub defines for vectors which are not used due to Kconfig decisions to spare the ifdeffery in the actual FRED dispatch code. Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-23-xin3.li@intel.com --- arch/x86/include/asm/idtentry.h | 71 +++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index e9f71b3217c2d2..570f286ca7ddae 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -137,6 +141,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +/** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. + */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + /** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ -__visible noinstr void func(struct pt_regs *regs) \ +static __always_inline void instr_##func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ - \ - instrumentation_begin(); \ __irq_enter_raw(); \ kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs); \ __irq_exit_raw(); \ +} \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + irqentry_state_t state = irqentry_enter(regs); \ + \ + instrumentation_begin(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,15 +445,18 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif #else /* !__ASSEMBLY__ */ @@ -655,23 +693,36 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif @@ -679,12 +730,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) From 4af12f6a393ca2be76de6c5484f79acc1167e1c8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:12 -0800 Subject: [PATCH 358/707] x86/fred: Add a debug fault entry stub for FRED When occurred on different ring level, i.e., from user or kernel context, #DB needs to be handled on different stack: User #DB on current task stack, while kernel #DB on a dedicated stack. This is exactly how FRED event delivery invokes an exception handler: ring 3 event on level 0 stack, i.e., current task stack; ring 0 event on the #DB dedicated stack specified in the IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception entry stub doesn't do stack switch. On a FRED system, the debug trap status information (DR6) is passed on the stack, to avoid the problem of transient state. Furthermore, FRED transitions avoid a lot of ugly corner cases the handling of which can, and should be, skipped. The FRED debug trap status information saved on the stack differs from DR6 in both stickiness and polarity; it is exactly in the format which debug_read_clear_dr6() returns for the IDT entry points. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-24-xin3.li@intel.com --- arch/x86/kernel/traps.c | 43 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3c37489018256f..1b19a170f5e35e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -935,8 +936,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -948,6 +948,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -977,7 +982,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1009,8 +1015,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1094,6 +1099,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_clear_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) From 3e91abaa567300fd48a0fac4c9aaedd30fa2f3f9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Fri, 15 Dec 2023 22:31:39 -0800 Subject: [PATCH 359/707] x86/fred: Add a NMI entry stub for FRED On a FRED system, NMIs nest both with themselves and faults, transient information is saved into the stack frame, and NMI unblocking only happens when the stack frame indicates that so should happen. Thus, the NMI entry stub for FRED is really quite small... Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231216063139.25567-1-xin3.li@intel.com --- arch/x86/kernel/nmi.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69feda..3130a66b0f48d0 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -35,6 +35,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -651,6 +652,47 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) #endif +#ifdef CONFIG_X86_FRED +/* + * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED + * event delivery, i.e., there is no problem of transient states. + * And NMI unblocking only happens when the stack frame indicates + * that so should happen. + * + * Thus, the NMI entry stub for FRED is really straightforward and + * as simple as most exception handlers. As such, #DB is allowed + * during NMI handling. + */ +DEFINE_FREDENTRY_NMI(exc_nmi) +{ + irqentry_state_t irq_state; + + if (arch_cpu_is_offline(smp_processor_id())) { + if (microcode_nmi_handler_enabled()) + microcode_offline_nmi_handler(); + return; + } + + /* + * Save CR2 for eventual restore to cover the case where the NMI + * hits the VMENTER/VMEXIT region where guest CR2 is life. This + * prevents guest state corruption in case that the NMI handler + * takes a page fault. + */ + this_cpu_write(nmi_cr2, read_cr2()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); +} +#endif + void stop_nmi(void) { ignore_nmis++; From 5dd56c94ca2f8834e7689cac0045d312ef3ac9c6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:14 -0800 Subject: [PATCH 360/707] x86/fred: Add a machine check entry stub for FRED Like #DB, when occurred on different ring level, i.e., from user or kernel context, #MCE needs to be handled on different stack: User #MCE on current task stack, while kernel #MCE on a dedicated stack. This is exactly how FRED event delivery invokes an exception handler: ring 3 event on level 0 stack, i.e., current task stack; ring 0 event on the #MCE dedicated stack specified in the IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry stub doesn't do stack switch. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-26-xin3.li@intel.com --- arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bc39252bc54f2e..04acdc3534c81a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -2166,6 +2167,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) From 6786137bf8fd717bed7ff9ce4eee34ce03a26631 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Sat, 9 Dec 2023 13:42:14 -0800 Subject: [PATCH 361/707] x86/fred: FRED entry/exit and dispatch code The code to actually handle kernel and event entry/exit using FRED. It is split up into two files thus: - entry_64_fred.S contains the actual entrypoints and exit code, and saves and restores registers. - entry_fred.c contains the two-level event dispatch code for FRED. The first-level dispatch is on the event type, and the second-level is on the event vector. [ bp: Fold in an allmodconfig clang build fix: https://lore.kernel.org/r/20240129064521.5168-1-xin3.li@intel.com and a CONFIG_IA32_EMULATION=n build fix: https://lore.kernel.org/r/20240127093728.1323-3-xin3.li@intel.com] Suggested-by: Thomas Gleixner Originally-by: Megha Dey Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231209214214.2932-1-xin3.li@intel.com --- arch/x86/entry/Makefile | 5 +- arch/x86/entry/entry_64_fred.S | 50 ++++++ arch/x86/entry/entry_fred.c | 245 ++++++++++++++++++++++++++ arch/x86/include/asm/asm-prototypes.h | 1 + arch/x86/include/asm/fred.h | 6 + arch/x86/include/asm/ia32.h | 4 +- 6 files changed, 308 insertions(+), 3 deletions(-) create mode 100644 arch/x86/entry/entry_64_fred.S create mode 100644 arch/x86/entry/entry_fred.c diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b0a..c93e7f5c2a0652 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -18,6 +18,9 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 00000000000000..c1ddaf6b068f4d --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user + FRED_EXIT + ERETU +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 00000000000000..125b62311b31f9 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, regs->orig_ax); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +}; + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b1a98fa38828e2..076bf8dee70264 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #ifndef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index f514fdb5a39f73..16a64ffecbf8d5 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -60,6 +60,12 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) return fred_info(regs)->edata; } +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); + #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } #endif /* CONFIG_X86_FRED */ diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index c7ef6ea2fa993c..4212c00c9708d4 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -69,7 +69,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); extern bool __ia32_enabled; -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } @@ -81,7 +81,7 @@ static inline void ia32_disable(void) #else /* !CONFIG_IA32_EMULATION */ -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } From db7c787d8ba268a8d8beabb0027715246375c6e0 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:16 -0800 Subject: [PATCH 362/707] x86/traps: Add sysvec_install() to install a system interrupt handler Add sysvec_install() to install a system interrupt handler into the IDT or the FRED system interrupt handler table. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-28-xin3.li@intel.com --- arch/x86/entry/entry_fred.c | 14 ++++++++++++++ arch/x86/include/asm/desc.h | 2 -- arch/x86/include/asm/idtentry.h | 15 +++++++++++++++ arch/x86/kernel/cpu/acrn.c | 4 ++-- arch/x86/kernel/cpu/mshyperv.c | 15 +++++++-------- arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/kvm.c | 2 +- drivers/xen/events/events_base.c | 2 +- 8 files changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 125b62311b31f9..3be0269bc0d465 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -119,6 +119,20 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), }; +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a26..ec95fe44fa3a03 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -402,8 +402,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 570f286ca7ddae..47d4c04d103df4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -459,6 +459,21 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } +#endif + +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad63f15..2c5b51aad91a03 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b66c..45e0e70e238cf3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -539,19 +539,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 660b601f1d6c33..0cd53fa8c65d1d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -337,7 +337,7 @@ void idt_invalidate(void) load_idt(&idt); } -void __init alloc_intr_gate(unsigned int n, const void *addr) +void __init idt_install_sysvec(unsigned int n, const void *function) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; @@ -346,5 +346,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr) return; if (!WARN_ON(test_and_set_bit(n, system_vectors))) - set_intr_gate(n, addr); + set_intr_gate(n, function); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dfe9945b9becee..b05557918ae20a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -829,7 +829,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } #ifdef CONFIG_SMP diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b8cfea7812d6b6..e2813bac92d40c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2216,7 +2216,7 @@ static __init void xen_alloc_callback_vector(void) return; pr_info("Xen HVM callback vector for event delivery is enabled\n"); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_xen_hvm_callback); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); } #else void xen_setup_callback_vector(void) {} From 531ff17a705a0f0ecafb8823956f69d5fbfda6fd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:17 -0800 Subject: [PATCH 363/707] x86/fred: Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled, otherwise the existing IDT code is chosen. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-29-xin3.li@intel.com --- arch/x86/entry/entry_64.S | 6 ++++++ arch/x86/entry/entry_64_fred.S | 1 + 2 files changed, 7 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 29ce68f8ede043..7c4b7263b8571e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -247,7 +247,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index c1ddaf6b068f4d..2271a1c690dc66 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -32,6 +32,7 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) FRED_ENTER call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) FRED_EXIT ERETU SYM_CODE_END(asm_fred_entrypoint_user) From ed63bc7d4953bd5fe93a5c3acef7f485fb216208 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:18 -0800 Subject: [PATCH 364/707] x86/fred: Fixup fault on ERETU by jumping to fred_entrypoint_user If the stack frame contains an invalid user context (e.g. due to invalid SS, a non-canonical RIP, etc.) the ERETU instruction will trap (#SS or #GP). From a Linux point of view, this really should be considered a user space failure, so use the standard fault fixup mechanism to intercept the fault, fix up the exception frame, and redirect execution to fred_entrypoint_user. The end result is that it appears just as if the hardware had taken the exception immediately after completing the transition to user space. Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-30-xin3.li@intel.com --- arch/x86/entry/entry_64_fred.S | 5 +- arch/x86/include/asm/extable_fixup_types.h | 4 +- arch/x86/mm/extable.c | 78 ++++++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 2271a1c690dc66..7fe2722ad90c16 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -3,6 +3,7 @@ * The actual FRED entry points. */ +#include #include #include "calling.h" @@ -34,7 +35,9 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) call fred_entry_from_user SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) FRED_EXIT - ERETU +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) SYM_CODE_END(asm_fred_entrypoint_user) /* diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index fe6312045042f8..7acf0383be8022 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 271dcb2deabc31..b522933bfa56e8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } +#ifdef CONFIG_X86_FRED +static bool ex_handler_eretu(const struct exception_table_entry *fixup, + struct pt_regs *regs, unsigned long error_code) +{ + struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); + unsigned short ss = uregs->ss; + unsigned short cs = uregs->cs; + + /* + * Move the NMI bit from the invalid stack frame, which caused ERETU + * to fault, to the fault handler's stack frame, thus to unblock NMI + * with the fault handler's ERETS instruction ASAP if NMI is blocked. + */ + regs->fred_ss.nmi = uregs->fred_ss.nmi; + + /* + * Sync event information to uregs, i.e., the ERETU return frame, but + * is it safe to write to the ERETU return frame which is just above + * current event stack frame? + * + * The RSP used by FRED to push a stack frame is not the value in %rsp, + * it is calculated from %rsp with the following 2 steps: + * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes + * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line + * when an event delivery doesn't trigger a stack level change. + * + * Here is an example with N*64 (N=1) bytes reserved: + * + * 64-byte cache line ==> ______________ + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETU return frame + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETS return frame + * + * Thus a new FRED stack frame will always be pushed below a previous + * FRED stack frame ((N*64) bytes may be reserved between), and it is + * safe to write to a previous FRED stack frame as they never overlap. + */ + fred_info(uregs)->edata = fred_event_data(regs); + uregs->ssx = regs->ssx; + uregs->fred_ss.ss = ss; + /* The NMI bit was moved away above */ + uregs->fred_ss.nmi = 0; + uregs->csx = regs->csx; + uregs->fred_cs.sl = 0; + uregs->fred_cs.wfe = 0; + uregs->cs = cs; + uregs->orig_ax = error_code; + + return ex_handler_default(fixup, regs); +} +#endif + int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); @@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); +#ifdef CONFIG_X86_FRED + case EX_TYPE_ERETU: + return ex_handler_eretu(e, regs, error_code); +#endif } BUG(); } From 8c968f4df73c62be94229c7dbbb330ba9fadbd50 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 5 Dec 2023 02:50:19 -0800 Subject: [PATCH 365/707] x86/entry/calling: Allow PUSH_AND_CLEAR_REGS being used beyond actual entry code PUSH_AND_CLEAR_REGS could be used besides actual entry code; in that case %rbp shouldn't be cleared (otherwise the frame pointer is destroyed) and UNWIND_HINT shouldn't be added. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-31-xin3.li@intel.com --- arch/x86/entry/calling.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1d94790a5491..3ff925b17b7ed5 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 From d8fbd04962865730bb67106e862bfbe363a9c284 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:20 -0800 Subject: [PATCH 366/707] x86/entry: Add fred_entry_from_kvm() for VMX to handle IRQ/NMI In IRQ/NMI induced VM exits, KVM VMX needs to execute the respective handlers, which requires the software to create a FRED stack frame, and use it to invoke the handlers. Add fred_irq_entry_from_kvm() for this job. Export fred_entry_from_kvm() because VMX can be compiled as a module. Suggested-by: Sean Christopherson Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-32-xin3.li@intel.com --- arch/x86/entry/entry_64_fred.S | 77 ++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_fred.c | 14 +++++++ arch/x86/include/asm/fred.h | 18 ++++++++ 3 files changed, 109 insertions(+) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 7fe2722ad90c16..a02bc6f3d2e6a4 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -3,8 +3,11 @@ * The actual FRED entry points. */ +#include + #include #include +#include #include "calling.h" @@ -52,3 +55,77 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) FRED_EXIT ERETS SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. + */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 3be0269bc0d465..6ecc08b6d72a24 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -257,3 +257,17 @@ __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) return fred_bad_type(regs, error_code); } + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 16a64ffecbf8d5..2fa9f34e5c95f5 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -9,6 +9,7 @@ #include #include +#include /* * FRED event return instruction opcodes for ERET{S,U}; supported in @@ -62,12 +63,29 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) void asm_fred_entrypoint_user(void); void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); __visible void fred_entry_from_user(struct pt_regs *regs); __visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } #endif /* CONFIG_X86_FRED */ #endif /* !__ASSEMBLY__ */ From cb5429aaa0c53d60414a08fb40f8d15d748c4cda Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:21 -0800 Subject: [PATCH 367/707] KVM: VMX: Call fred_entry_from_kvm() for IRQ/NMI handling When FRED is enabled, call fred_entry_from_kvm() to handle IRQ/NMI in IRQ/NMI induced VM exits. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Acked-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231205105030.8698-33-xin3.li@intel.com --- arch/x86/kvm/vmx/vmx.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e262bc2ba4e569..cce92f701deeed 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -6960,14 +6961,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - vmx_do_interrupt_irqoff(gate_offset(desc)); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); + else + vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7260,7 +7263,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); kvm_after_interrupt(vcpu); } From ae46f3978ae4eb9da013ff9963f105de2db2f8ec Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:22 -0800 Subject: [PATCH 368/707] x86/syscall: Split IDT syscall setup code into idt_syscall_init() Because FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER and ERETU is the only legit instruction to return to ring 3, there is NO need to setup SYSCALL and SYSENTER MSRs for FRED, except the IA32_STAR MSR. Split IDT syscall setup code into idt_syscall_init() to make it easy to skip syscall setup code when FRED is enabled. Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-34-xin3.li@intel.com --- arch/x86/kernel/cpu/common.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c3a175770df032..4f5e4aa35e5a9e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2066,10 +2066,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { @@ -2103,6 +2101,15 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR From 43ca697baecf3c90fe108a61cf444de20bbfa5b9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:23 -0800 Subject: [PATCH 369/707] x86/fred: Add FRED initialization functions Add cpu_init_fred_exceptions() to: - Set FRED entrypoints for events happening in ring 0 and 3. - Specify the stack level for IRQs occurred ring 0. - Specify dedicated event stacks for #DB/NMI/#MCE/#DF. - Enable FRED and invalidtes IDT. - Force 32-bit system calls to use "int $0x80" only. Add fred_complete_exception_setup() to: - Initialize system_vectors as done for IDT systems. - Set unused sysvec_table entries to fred_handle_spurious_interrupt(). Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-35-xin3.li@intel.com --- arch/x86/entry/entry_fred.c | 21 +++++++++++++ arch/x86/include/asm/fred.h | 5 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/fred.c | 59 +++++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) create mode 100644 arch/x86/kernel/fred.c diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 6ecc08b6d72a24..ac120cbdaaf2b4 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -133,6 +133,27 @@ void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; } +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 2fa9f34e5c95f5..e86c7ba32435f5 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -83,8 +83,13 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int asm_fred_entry_from_kvm(ss); } +void cpu_init_fred_exceptions(void); +void fred_complete_exception_setup(void); + #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void fred_complete_exception_setup(void) { } static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } #endif /* CONFIG_X86_FRED */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f4d..0dcbfc1a4c419f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -48,6 +48,7 @@ obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_X86_FRED) += fred.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c new file mode 100644 index 00000000000000..4bcd8791ad96ad --- /dev/null +++ b/arch/x86/kernel/fred.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +#include +#include +#include +#include + +/* #DB in the kernel would imply the use of a kernel debugger. */ +#define FRED_DB_STACK_LEVEL 1UL +#define FRED_NMI_STACK_LEVEL 2UL +#define FRED_MC_STACK_LEVEL 2UL +/* + * #DF is the highest level because a #DF means "something went wrong + * *while delivering an exception*." The number of cases for which that + * can happen with FRED is drastically reduced and basically amounts to + * "the stack you pointed me to is broken." Thus, always change stacks + * on #DF, which means it should be at the highest level. + */ +#define FRED_DF_STACK_LEVEL 3UL + +#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) + +void cpu_init_fred_exceptions(void) +{ + /* When FRED is enabled by default, remove this log message */ + pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + + wrmsrl(MSR_IA32_FRED_CONFIG, + /* Reserve for CALL emulation */ + FRED_CONFIG_REDZONE | + FRED_CONFIG_INT_STKLVL(0) | + FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. + */ + wrmsrl(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} From b564b0111a3f03d1a92ba87c4b0f054ad1845963 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:24 -0800 Subject: [PATCH 370/707] x86/fred: Invoke FRED initialization code to enable FRED Let cpu_init_exception_handling() call cpu_init_fred_exceptions() to initialize FRED. However if FRED is unavailable or disabled, it falls back to set up TSS IST and initialize IDT. Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-36-xin3.li@intel.com --- arch/x86/kernel/cpu/common.c | 22 +++++++++++++++++----- arch/x86/kernel/irqinit.c | 7 ++++++- arch/x86/kernel/traps.c | 5 ++++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4f5e4aa35e5a9e..cf82e3181f7a61 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -2107,7 +2108,15 @@ void syscall_init(void) /* The default user and kernel segments */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - idt_syscall_init(); + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); } #else /* CONFIG_X86_64 */ @@ -2213,8 +2222,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2223,8 +2233,10 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + load_current_idt(); } /* diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c683666876f1c7..f79c5edc0b892d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -96,7 +97,11 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - idt_setup_apic_and_irq_gates(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_complete_exception_setup(); + else + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1b19a170f5e35e..6cb31df3d5ffbe 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1438,7 +1438,10 @@ void __init trap_init(void) /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } From 8e7d967f04df0fa2c2db00f47ac4cd5ea16ade91 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Tue, 30 Jan 2024 17:49:00 +0530 Subject: [PATCH 371/707] dt-bindings: i2c: pca954x: Add custom properties for MAX7357 Maxim Max7357 has a configuration register to enable additional features. These features aren't enabled by default & its up to board designer to enable the same as it may have unexpected side effects. These should be validated for proper functioning & detection of devices in secondary bus as sometimes it can cause secondary bus being disabled. Add booleans for: - maxim,isolate-stuck-channel - maxim,send-flush-out-sequence - maxim,preconnection-wiggle-test-enable Signed-off-by: Patrick Rudolph Signed-off-by: Naresh Solanki Reviewed-by: Rob Herring Signed-off-by: Andi Shyti --- .../bindings/i2c/i2c-mux-pca954x.yaml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml b/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml index 2d7bb998b0e9d2..9aa0585200c9cd 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml @@ -71,6 +71,23 @@ properties: description: A voltage regulator supplying power to the chip. On PCA9846 the regulator supplies power to VDD2 (core logic) and optionally to VDD1. + maxim,isolate-stuck-channel: + type: boolean + description: Allows to use non faulty channels while a stuck channel is + isolated from the upstream bus. If not set all channels are isolated from + the upstream bus until the fault is cleared. + + maxim,send-flush-out-sequence: + type: boolean + description: Send a flush-out sequence to stuck auxiliary buses + automatically after a stuck channel is being detected. + + maxim,preconnection-wiggle-test-enable: + type: boolean + description: Send a STOP condition to the auxiliary buses when the switch + register activates a channel to detect a stuck high fault. On fault the + channel is isolated from the upstream bus. + required: - compatible - reg @@ -95,6 +112,19 @@ allOf: "#interrupt-cells": false interrupt-controller: false + - if: + not: + properties: + compatible: + contains: + enum: + - maxim,max7357 + then: + properties: + maxim,isolate-stuck-channel: false + maxim,send-flush-out-sequence: false + maxim,preconnection-wiggle-test-enable: false + unevaluatedProperties: false examples: From 6b572ea231236bb3be4b819d92119470ac121a9e Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Tue, 30 Jan 2024 17:49:01 +0530 Subject: [PATCH 372/707] i2c: muxes: pca954x: Enable features on MAX7357 Enable additional features based on DT settings and unconditionally release the shared interrupt pin after 1.6 seconds and allow to use it as reset. These features aren't enabled by default and it's up to board designer to validate for proper functioning and detection of devices in secondary bus as sometimes it can cause secondary bus being disabled. Signed-off-by: Patrick Rudolph Signed-off-by: Naresh Solanki Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/muxes/i2c-mux-pca954x.c | 43 ++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index 2219062104fbca..f5dfc33b97c0ab 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -57,6 +57,20 @@ #define PCA954X_IRQ_OFFSET 4 +/* + * MAX7357's configuration register is writeable after POR, but + * can be locked by setting the basic mode bit. MAX7358 configuration + * register is locked by default and needs to be unlocked first. + * The configuration register holds the following settings: + */ +#define MAX7357_CONF_INT_ENABLE BIT(0) +#define MAX7357_CONF_FLUSH_OUT BIT(1) +#define MAX7357_CONF_RELEASE_INT BIT(2) +#define MAX7357_CONF_DISCON_SINGLE_CHAN BIT(4) +#define MAX7357_CONF_PRECONNECT_TEST BIT(7) + +#define MAX7357_POR_DEFAULT_CONF MAX7357_CONF_INT_ENABLE + enum pca_type { max_7356, max_7357, @@ -470,7 +484,34 @@ static int pca954x_init(struct i2c_client *client, struct pca954x *data) else data->last_chan = 0; /* Disconnect multiplexer */ - ret = i2c_smbus_write_byte(client, data->last_chan); + if (device_is_compatible(&client->dev, "maxim,max7357")) { + if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_WRITE_BYTE_DATA)) { + u8 conf = MAX7357_POR_DEFAULT_CONF; + /* + * The interrupt signal is shared with the reset pin. Release the + * interrupt after 1.6 seconds to allow using the pin as reset. + */ + conf |= MAX7357_CONF_RELEASE_INT; + + if (device_property_read_bool(&client->dev, "maxim,isolate-stuck-channel")) + conf |= MAX7357_CONF_DISCON_SINGLE_CHAN; + if (device_property_read_bool(&client->dev, + "maxim,send-flush-out-sequence")) + conf |= MAX7357_CONF_FLUSH_OUT; + if (device_property_read_bool(&client->dev, + "maxim,preconnection-wiggle-test-enable")) + conf |= MAX7357_CONF_PRECONNECT_TEST; + + ret = i2c_smbus_write_byte_data(client, data->last_chan, conf); + } else { + dev_warn(&client->dev, "Write byte data not supported." + "Cannot enable enhanced mode features\n"); + ret = i2c_smbus_write_byte(client, data->last_chan); + } + } else { + ret = i2c_smbus_write_byte(client, data->last_chan); + } + if (ret < 0) data->last_chan = 0; From 6ae2b145edd725c2234c7fde36ebcc5e1a4d4e7d Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:45 +0800 Subject: [PATCH 373/707] arm64: dts: allwinner: h6: Add RX DMA channel for SPDIF The SPDIF hardware found on the H6 supports both transmit and receive functions. However it is missing the RX DMA channel. Add the SPDIF hardware block's RX DMA channel. Also remove the by-default pinmux, since the end device can choose to implement either or both functionalities. Fixes: f95b598df419 ("arm64: dts: allwinner: Add SPDIF node for Allwinner H6") Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240127163247.384439-6-wens@kernel.org Signed-off-by: Jernej Skrabec --- arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts | 2 ++ arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi | 2 ++ arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi | 7 +++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 9ec49ac2f6fd5d..381d58cea092d9 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -291,6 +291,8 @@ }; &spdif { + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi index 4903d6358112de..855b7d43bc503a 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi @@ -166,6 +166,8 @@ }; &spdif { + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi index ca1d287a0a01d9..d11e5041bae9a4 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi @@ -406,6 +406,7 @@ function = "spi1"; }; + /omit-if-no-ref/ spdif_tx_pin: spdif-tx-pin { pins = "PH7"; function = "spdif"; @@ -655,10 +656,8 @@ clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>; clock-names = "apb", "spdif"; resets = <&ccu RST_BUS_SPDIF>; - dmas = <&dma 2>; - dma-names = "tx"; - pinctrl-names = "default"; - pinctrl-0 = <&spdif_tx_pin>; + dmas = <&dma 2>, <&dma 2>; + dma-names = "rx", "tx"; status = "disabled"; }; From adb3ebfc285eb2c0ad67039bda58683be415647f Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:46 +0800 Subject: [PATCH 374/707] arm64: dts: allwinner: h616: Add DMA controller and DMA channels The DMA controllers found on the H616 and H618 are the same as the one found on the A100. The only difference is the DMA endpoint (DRQ) layout. Add a device node for it, and add DMA channels for existing peripherals. Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Link: https://lore.kernel.org/r/20240127163247.384439-7-wens@kernel.org Signed-off-by: Jernej Skrabec --- .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index d549d277d9729f..885809137b9de3 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -133,6 +133,19 @@ #reset-cells = <1>; }; + dma: dma-controller@3002000 { + compatible = "allwinner,sun50i-h616-dma", + "allwinner,sun50i-a100-dma"; + reg = <0x03002000 0x1000>; + interrupts = ; + clocks = <&ccu CLK_BUS_DMA>, <&ccu CLK_MBUS_DMA>; + clock-names = "bus", "mbus"; + dma-channels = <16>; + dma-requests = <49>; + resets = <&ccu RST_BUS_DMA>; + #dma-cells = <1>; + }; + sid: efuse@3006000 { compatible = "allwinner,sun50i-h616-sid", "allwinner,sun50i-a64-sid"; reg = <0x03006000 0x1000>; @@ -339,6 +352,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART0>; + dmas = <&dma 14>, <&dma 14>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART0>; status = "disabled"; }; @@ -350,6 +365,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART1>; + dmas = <&dma 15>, <&dma 15>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART1>; status = "disabled"; }; @@ -361,6 +378,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART2>; + dmas = <&dma 16>, <&dma 16>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART2>; status = "disabled"; }; @@ -372,6 +391,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART3>; + dmas = <&dma 17>, <&dma 17>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART3>; status = "disabled"; }; @@ -383,6 +404,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART4>; + dmas = <&dma 18>, <&dma 18>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART4>; status = "disabled"; }; @@ -394,6 +417,8 @@ reg-shift = <2>; reg-io-width = <4>; clocks = <&ccu CLK_BUS_UART5>; + dmas = <&dma 19>, <&dma 19>; + dma-names = "tx", "rx"; resets = <&ccu RST_BUS_UART5>; status = "disabled"; }; @@ -405,6 +430,8 @@ reg = <0x05002000 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C0>; + dmas = <&dma 43>, <&dma 43>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C0>; pinctrl-names = "default"; pinctrl-0 = <&i2c0_pins>; @@ -420,6 +447,8 @@ reg = <0x05002400 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C1>; + dmas = <&dma 44>, <&dma 44>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C1>; status = "disabled"; #address-cells = <1>; @@ -433,6 +462,8 @@ reg = <0x05002800 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C2>; + dmas = <&dma 45>, <&dma 45>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C2>; status = "disabled"; #address-cells = <1>; @@ -446,6 +477,8 @@ reg = <0x05002c00 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C3>; + dmas = <&dma 46>, <&dma 46>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C3>; status = "disabled"; #address-cells = <1>; @@ -459,6 +492,8 @@ reg = <0x05003000 0x400>; interrupts = ; clocks = <&ccu CLK_BUS_I2C4>; + dmas = <&dma 47>, <&dma 47>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_I2C4>; status = "disabled"; #address-cells = <1>; @@ -472,6 +507,8 @@ interrupts = ; clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>; clock-names = "ahb", "mod"; + dmas = <&dma 22>, <&dma 22>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_SPI0>; status = "disabled"; #address-cells = <1>; @@ -485,6 +522,8 @@ interrupts = ; clocks = <&ccu CLK_BUS_SPI1>, <&ccu CLK_SPI1>; clock-names = "ahb", "mod"; + dmas = <&dma 23>, <&dma 23>; + dma-names = "rx", "tx"; resets = <&ccu RST_BUS_SPI1>; status = "disabled"; #address-cells = <1>; @@ -734,6 +773,8 @@ reg = <0x07081400 0x400>; interrupts = ; clocks = <&r_ccu CLK_R_APB2_I2C>; + dmas = <&dma 48>, <&dma 48>; + dma-names = "rx", "tx"; resets = <&r_ccu RST_R_APB2_I2C>; status = "disabled"; #address-cells = <1>; From 7ef7d495bb10fe338f85f0769e6a3cac4ebf2d74 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 28 Jan 2024 00:32:47 +0800 Subject: [PATCH 375/707] arm64: dts: allwinner: h616: Add SPDIF device node The H616 SoC has an SPDIF transmitter hardware block, which has the same layout as the one in the H6, minus the receiver side. Add a device node for it, and a default pinmux. Signed-off-by: Chen-Yu Tsai Reviewed-by: Andre Przywara Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20240127163247.384439-8-wens@kernel.org Signed-off-by: Jernej Skrabec --- .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi index 885809137b9de3..b1bf4fb5fc58b8 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi @@ -253,6 +253,11 @@ function = "spi1"; }; + spdif_tx_pin: spdif-tx-pin { + pins = "PH4"; + function = "spdif"; + }; + uart0_ph_pins: uart0-ph-pins { pins = "PH0", "PH1"; function = "uart0"; @@ -550,6 +555,21 @@ }; }; + spdif: spdif@5093000 { + compatible = "allwinner,sun50i-h616-spdif"; + reg = <0x05093000 0x400>; + interrupts = ; + clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>; + clock-names = "apb", "spdif"; + resets = <&ccu RST_BUS_SPDIF>; + dmas = <&dma 2>; + dma-names = "tx"; + pinctrl-names = "default"; + pinctrl-0 = <&spdif_tx_pin>; + #sound-dai-cells = <0>; + status = "disabled"; + }; + usbotg: usb@5100000 { compatible = "allwinner,sun50i-h616-musb", "allwinner,sun8i-h3-musb"; From a0868e7c5575ec87d77d0c0924e6812b20e2d879 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 11 Jan 2024 14:59:01 +0100 Subject: [PATCH 376/707] KVM: selftests: Compare wall time from xen shinfo against KVM_GET_CLOCK xen_shinfo_test is observed to be flaky failing sporadically with "VM time too old". With min_ts/max_ts debug print added: Wall clock (v 3269818) 1704906491.986255664 Time info 1: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1 Time info 2: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1 min_ts: 1704906491.986312153 max_ts: 1704906506.001006963 ==== Test Assertion Failure ==== x86_64/xen_shinfo_test.c:1003: cmp_timespec(&min_ts, &vm_ts) <= 0 pid=32724 tid=32724 errno=4 - Interrupted system call 1 0x00000000004030ad: main at xen_shinfo_test.c:1003 2 0x00007fca6b23feaf: ?? ??:0 3 0x00007fca6b23ff5f: ?? ??:0 4 0x0000000000405e04: _start at ??:? VM time too old The test compares wall clock data from shinfo (which is the output of kvm_get_wall_clock_epoch()) against clock_gettime(CLOCK_REALTIME) in the host system before the VM is created. In the example above, it compares shinfo: 1704906491.986255664 vs min_ts: 1704906491.986312153 and fails as the later is greater than the former. While this sounds like a sane test, it doesn't pass reality check: kvm_get_wall_clock_epoch() calculates guest's epoch (realtime when the guest was created) by subtracting kvmclock from the current realtime and the calculation happens when shinfo is setup. The problem is that kvmclock is a raw clock and realtime clock is affected by NTP. This means that if realtime ticks with a slightly reduced frequency, "guest's epoch" calculated by kvm_get_wall_clock_epoch() will actually tick backwards! This is not a big issue from guest's perspective as the guest can't really observe this but this epoch can't be compared with a fixed clock_gettime() on the host. Replace the check with comparing wall clock data from shinfo to KVM_GET_CLOCK. The later gives both realtime and kvmclock so guest's epoch can be calculated by subtraction. Note, the computed epoch may still differ a few nanoseconds from shinfo as different TSC is used and there are rounding errors but 100 nanoseconds margin should be enough to cover it (famous last words). Reported-by: Jan Richter Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240111135901.1785096-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xen_shinfo_test.c | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 9ec9ab60b63ee2..5e1ad243d95dc0 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -375,20 +375,6 @@ static void guest_code(void) GUEST_SYNC(TEST_DONE); } -static int cmp_timespec(struct timespec *a, struct timespec *b) -{ - if (a->tv_sec > b->tv_sec) - return 1; - else if (a->tv_sec < b->tv_sec) - return -1; - else if (a->tv_nsec > b->tv_nsec) - return 1; - else if (a->tv_nsec < b->tv_nsec) - return -1; - else - return 0; -} - static struct vcpu_info *vinfo; static struct kvm_vcpu *vcpu; @@ -425,7 +411,6 @@ static void *juggle_shinfo_state(void *arg) int main(int argc, char *argv[]) { - struct timespec min_ts, max_ts, vm_ts; struct kvm_xen_hvm_attr evt_reset; struct kvm_vm *vm; pthread_t thread; @@ -443,8 +428,6 @@ int main(int argc, char *argv[]) bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); - clock_gettime(CLOCK_REALTIME, &min_ts); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Map a region for the shared_info page */ @@ -969,7 +952,6 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); alarm(0); - clock_gettime(CLOCK_REALTIME, &max_ts); /* * Just a *really* basic check that things are being put in the @@ -978,11 +960,16 @@ int main(int argc, char *argv[]) */ struct pvclock_wall_clock *wc; struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); ti2 = addr_gpa2hva(vm, PVTIME_ADDR); + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + if (verbose) { printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec); printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", @@ -991,14 +978,19 @@ int main(int argc, char *argv[]) printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul, ti2->tsc_shift, ti2->flags); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", kcdata.realtime / NSEC_PER_SEC, + kcdata.realtime % NSEC_PER_SEC); + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", kcdata.clock / NSEC_PER_SEC, + kcdata.clock % NSEC_PER_SEC); } - vm_ts.tv_sec = wc->sec; - vm_ts.tv_nsec = wc->nsec; TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); - TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); - TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); + + TEST_ASSERT(llabs(delta) < 100, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); TEST_ASSERT(ti->version && !(ti->version & 1), "Bad time_info version %x", ti->version); From 9f14f46a276521c92cdffb0fc36f907e868d3888 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Sep 2023 21:34:13 +0200 Subject: [PATCH 377/707] i2c: i801: Replace magic value with constant in dmi_check_onboard_devices Replace magic number 10 with the appropriate constant. Signed-off-by: Heiner Kallweit Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-i801.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 3932e8d96a1717..8af944bd89f219 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1117,7 +1117,7 @@ static void dmi_check_onboard_devices(const struct dmi_header *dm, void *adap) { int i, count; - if (dm->type != 10) + if (dm->type != DMI_ENTRY_ONBOARD_DEVICE) return; count = (dm->length - sizeof(struct dmi_header)) / 2; From 96b125361866d998471c1380f809f2a2b4db60c0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Sep 2023 21:35:00 +0200 Subject: [PATCH 378/707] i2c: i801: Remove unused argument from tco functions Argument priv isn't used, so remove it. Signed-off-by: Heiner Kallweit Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-i801.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 8af944bd89f219..b9b850b69b73db 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1445,8 +1445,7 @@ static inline void i801_del_mux(struct i801_priv *priv) { } #endif static struct platform_device * -i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev, - struct resource *tco_res) +i801_add_tco_spt(struct pci_dev *pci_dev, struct resource *tco_res) { static const struct itco_wdt_platform_data pldata = { .name = "Intel PCH", @@ -1477,8 +1476,7 @@ i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev, } static struct platform_device * -i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev, - struct resource *tco_res) +i801_add_tco_cnl(struct pci_dev *pci_dev, struct resource *tco_res) { static const struct itco_wdt_platform_data pldata = { .name = "Intel PCH", @@ -1518,9 +1516,9 @@ static void i801_add_tco(struct i801_priv *priv) res->flags = IORESOURCE_IO; if (priv->features & FEATURE_TCO_CNL) - priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res); + priv->tco_pdev = i801_add_tco_cnl(pci_dev, tco_res); else - priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res); + priv->tco_pdev = i801_add_tco_spt(pci_dev, tco_res); if (IS_ERR(priv->tco_pdev)) dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); From 449d0d6ccf55d52b707fccf2b7756de4636742df Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:17 +0100 Subject: [PATCH 379/707] KVM: selftests: Generalize check_clocksource() from kvm_clock_test Several existing x86 selftests need to check that the underlying system clocksource is TSC or based on TSC but every test implements its own check. As a first step towards unification, extract check_clocksource() from kvm_clock_test and split it into two functions: arch-neutral 'sys_get_cur_clocksource()' and x86-specific 'sys_clocksource_is_tsc()'. Fix a couple of pre-existing issues in kvm_clock_test: memory leakage in check_clocksource() and using TEST_ASSERT() instead of TEST_REQUIRE(). The change also makes the test fail when system clocksource can't be read from sysfs. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-2-vkuznets@redhat.com [sean: eliminate if-elif pattern just to set a bool true] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/test_util.h | 2 + .../selftests/kvm/include/x86_64/processor.h | 2 + tools/testing/selftests/kvm/lib/test_util.c | 25 ++++++++++++ .../selftests/kvm/lib/x86_64/processor.c | 10 +++++ .../selftests/kvm/x86_64/kvm_clock_test.c | 38 +------------------ 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 71a41fa924b7d0..50a5e31ba8da1b 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -195,4 +195,6 @@ __printf(3, 4) int guest_snprintf(char *buf, int n, const char *fmt, ...); char *strdup_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2), nonnull(1))); +char *sys_get_cur_clocksource(void); + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a84863503fcb46..01eec72e0d3e36 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1271,4 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +bool sys_clocksource_is_tsc(void); + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 5d7f28b02d73ba..5a8f8becb12984 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -392,3 +392,28 @@ char *strdup_printf(const char *fmt, ...) return str; } + +#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource" + +char *sys_get_cur_clocksource(void) +{ + char *clk_name; + struct stat st; + FILE *fp; + + fp = fopen(CLOCKSOURCE_PATH, "r"); + TEST_ASSERT(fp, "failed to open clocksource file, errno: %d", errno); + + TEST_ASSERT(!fstat(fileno(fp), &st), "failed to stat clocksource file, errno: %d", + errno); + + clk_name = malloc(st.st_size); + TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); + + TEST_ASSERT(fgets(clk_name, st.st_size, fp), "failed to read clocksource file: %d", + ferror(fp)); + + fclose(fp); + + return clk_name; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4bc52948447d8c..e6964ff2a37da7 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1299,3 +1299,13 @@ void kvm_selftest_arch_init(void) host_cpu_is_intel = this_cpu_is_intel(); host_cpu_is_amd = this_cpu_is_amd(); } + +bool sys_clocksource_is_tsc(void) +{ + char *clk_name = sys_get_cur_clocksource(); + bool ret = !strcmp(clk_name, "tsc\n"); + + free(clk_name); + + return ret; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 3e0b7d51abdaa5..6fcc1a43358757 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -132,42 +132,6 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } -#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource" - -static void check_clocksource(void) -{ - char *clk_name; - struct stat st; - FILE *fp; - - fp = fopen(CLOCKSOURCE_PATH, "r"); - if (!fp) { - pr_info("failed to open clocksource file: %d; assuming TSC.\n", - errno); - return; - } - - if (fstat(fileno(fp), &st)) { - pr_info("failed to stat clocksource file: %d; assuming TSC.\n", - errno); - goto out; - } - - clk_name = malloc(st.st_size); - TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); - - if (!fgets(clk_name, st.st_size, fp)) { - pr_info("failed to read clocksource file: %d; assuming TSC.\n", - ferror(fp)); - goto out; - } - - TEST_ASSERT(!strncmp(clk_name, "tsc\n", st.st_size), - "clocksource not supported: %s", clk_name); -out: - fclose(fp); -} - int main(void) { struct kvm_vcpu *vcpu; @@ -179,7 +143,7 @@ int main(void) flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); - check_clocksource(); + TEST_REQUIRE(sys_clocksource_is_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); From a79036441a68ca04061dda185b5d9cae1d76ed82 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:18 +0100 Subject: [PATCH 380/707] KVM: selftests: Use generic sys_clocksource_is_tsc() in vmx_nested_tsc_scaling_test Despite its name, system_has_stable_tsc() just checks that system clocksource is 'tsc'; this can now be done with generic sys_clocksource_is_tsc(). No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index e710b6e7fb384a..93b0a850a24003 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -116,23 +116,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } -static bool system_has_stable_tsc(void) -{ - bool tsc_is_stable; - FILE *fp; - char buf[4]; - - fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); - if (fp == NULL) - return false; - - tsc_is_stable = fgets(buf, sizeof(buf), fp) && - !strncmp(buf, "tsc", sizeof(buf)); - - fclose(fp); - return tsc_is_stable; -} - int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; @@ -148,7 +131,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - TEST_REQUIRE(system_has_stable_tsc()); + TEST_REQUIRE(sys_clocksource_is_tsc()); /* * We set L1's scale factor to be a random number from 2 to 10. From 436e6e541cb27added742ed7998612d5a466223c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:19 +0100 Subject: [PATCH 381/707] KVM: selftests: Run clocksource dependent tests with hyperv_clocksource_tsc_page too KVM's 'gtod_is_based_on_tsc()' recognizes two clocksources: 'tsc' and 'hyperv_clocksource_tsc_page' and enables kvmclock in 'masterclock' mode when either is in use. Transform 'sys_clocksource_is_tsc()' into 'sys_clocksource_is_based_on_tsc()' to support the later. This affects two tests: kvm_clock_test and vmx_nested_tsc_scaling_test, both seem to work well when system clocksource is 'hyperv_clocksource_tsc_page'. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-4-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 5 +++-- tools/testing/selftests/kvm/x86_64/kvm_clock_test.c | 2 +- .../selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 01eec72e0d3e36..5bca8c947c8253 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1271,6 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) -bool sys_clocksource_is_tsc(void); +bool sys_clocksource_is_based_on_tsc(void); #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index e6964ff2a37da7..f639b3e062e3a3 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1300,10 +1300,11 @@ void kvm_selftest_arch_init(void) host_cpu_is_amd = this_cpu_is_amd(); } -bool sys_clocksource_is_tsc(void) +bool sys_clocksource_is_based_on_tsc(void) { char *clk_name = sys_get_cur_clocksource(); - bool ret = !strcmp(clk_name, "tsc\n"); + bool ret = !strcmp(clk_name, "tsc\n") || + !strcmp(clk_name, "hyperv_clocksource_tsc_page\n"); free(clk_name); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 6fcc1a43358757..5bc12222d87af6 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -143,7 +143,7 @@ int main(void) flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); - TEST_REQUIRE(sys_clocksource_is_tsc()); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 93b0a850a24003..1759fa5cb3f29c 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -131,7 +131,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - TEST_REQUIRE(sys_clocksource_is_tsc()); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); /* * We set L1's scale factor to be a random number from 2 to 10. From 14fce852a14b4c74411ec0553322d23f2a740138 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:20 +0100 Subject: [PATCH 382/707] KVM: selftests: Make hyperv_clock require TSC based system clocksource KVM sets up Hyper-V TSC page clocksource for its guests when system clocksource is 'based on TSC' (see gtod_is_based_on_tsc()), running hyperv_clock with any other clocksource leads to imminent failure. Add the missing requirement to make the test skip gracefully. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-5-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 65690d916db7e6..e058bc676cd693 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -212,6 +212,7 @@ int main(void) int stage; TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); From 57cc5371293416a162dafa4afab9c4f1d7f904c2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:21 +0100 Subject: [PATCH 383/707] KVM: x86: Make gtod_is_based_on_tsc() return 'bool' gtod_is_based_on_tsc() is boolean in nature, i.e. it returns '1' for good clocksources and '0' otherwise. Moreover, its result is used raw by kvm_get_time_and_clockread()/kvm_get_walltime_and_clockread() which are 'bool'. No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-6-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 363b1c08020578..aa27aec10860a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2507,7 +2507,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) } #ifdef CONFIG_X86_64 -static inline int gtod_is_based_on_tsc(int mode) +static inline bool gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } From 11f1357336cde9924da0b455e528f11fbd5011f4 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Thu, 25 Jan 2024 14:56:36 +0100 Subject: [PATCH 384/707] i2c: imx: move to generic GPIO recovery Starting with commit 75820314de26 ("i2c: core: add generic I2C GPIO recovery") GPIO bus recovery is supported by the I2C core, so we can remove the driver implementation and use that one instead. As a nice side-effect, pinctrl becomes optional, allowing bus recovery on LS1021A, which does not have such luxury, but can be wired up to use extra fixed GPIO pins. Note: The previous error messages about bus recovery not being supported is dropped with this change. Given that it is perfectly possible to have platforms where bus recovery works without pinctrl support, I happen to work on one such, both error messages does not really make sense in those cases. And I don't see how to know if this is the case or not. Signed-off-by: Esben Haabendal Acked-by: Oleksij Rempel Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 62 +++--------------------------------- 1 file changed, 5 insertions(+), 57 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 88a053987403cc..d6ba93fc7fee05 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -212,10 +212,6 @@ struct imx_i2c_struct { const struct imx_i2c_hwdata *hwdata; struct i2c_bus_recovery_info rinfo; - struct pinctrl *pinctrl; - struct pinctrl_state *pinctrl_pins_default; - struct pinctrl_state *pinctrl_pins_gpio; - struct imx_i2c_dma *dma; struct i2c_client *slave; enum i2c_slave_event last_slave_event; @@ -1357,24 +1353,6 @@ static int i2c_imx_xfer_atomic(struct i2c_adapter *adapter, return result; } -static void i2c_imx_prepare_recovery(struct i2c_adapter *adap) -{ - struct imx_i2c_struct *i2c_imx; - - i2c_imx = container_of(adap, struct imx_i2c_struct, adapter); - - pinctrl_select_state(i2c_imx->pinctrl, i2c_imx->pinctrl_pins_gpio); -} - -static void i2c_imx_unprepare_recovery(struct i2c_adapter *adap) -{ - struct imx_i2c_struct *i2c_imx; - - i2c_imx = container_of(adap, struct imx_i2c_struct, adapter); - - pinctrl_select_state(i2c_imx->pinctrl, i2c_imx->pinctrl_pins_default); -} - /* * We switch SCL and SDA to their GPIO function and do some bitbanging * for bus recovery. These alternative pinmux settings can be @@ -1385,43 +1363,13 @@ static void i2c_imx_unprepare_recovery(struct i2c_adapter *adap) static int i2c_imx_init_recovery_info(struct imx_i2c_struct *i2c_imx, struct platform_device *pdev) { - struct i2c_bus_recovery_info *rinfo = &i2c_imx->rinfo; - - i2c_imx->pinctrl = devm_pinctrl_get(&pdev->dev); - if (!i2c_imx->pinctrl) { - dev_info(&pdev->dev, "pinctrl unavailable, bus recovery not supported\n"); - return 0; - } - if (IS_ERR(i2c_imx->pinctrl)) { - dev_info(&pdev->dev, "can't get pinctrl, bus recovery not supported\n"); - return PTR_ERR(i2c_imx->pinctrl); - } - - i2c_imx->pinctrl_pins_default = pinctrl_lookup_state(i2c_imx->pinctrl, - PINCTRL_STATE_DEFAULT); - i2c_imx->pinctrl_pins_gpio = pinctrl_lookup_state(i2c_imx->pinctrl, - "gpio"); - rinfo->sda_gpiod = devm_gpiod_get_optional(&pdev->dev, "sda", GPIOD_IN); - rinfo->scl_gpiod = devm_gpiod_get(&pdev->dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); - - if (PTR_ERR(rinfo->sda_gpiod) == -EPROBE_DEFER || - PTR_ERR(rinfo->scl_gpiod) == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (IS_ERR(rinfo->sda_gpiod) || - IS_ERR(rinfo->scl_gpiod) || - IS_ERR(i2c_imx->pinctrl_pins_default) || - IS_ERR(i2c_imx->pinctrl_pins_gpio)) { - dev_dbg(&pdev->dev, "recovery information incomplete\n"); - return 0; - } + struct i2c_bus_recovery_info *bri = &i2c_imx->rinfo; - dev_dbg(&pdev->dev, "using scl%s for recovery\n", - rinfo->sda_gpiod ? ",sda" : ""); + bri->pinctrl = devm_pinctrl_get(&pdev->dev); + if (IS_ERR(bri->pinctrl)) + return PTR_ERR(bri->pinctrl); - rinfo->prepare_recovery = i2c_imx_prepare_recovery; - rinfo->unprepare_recovery = i2c_imx_unprepare_recovery; - rinfo->recover_bus = i2c_generic_scl_recovery; - i2c_imx->adapter.bus_recovery_info = rinfo; + i2c_imx->adapter.bus_recovery_info = bri; return 0; } From b5d5f64f9bd2dab3b9cebdc1bf52e82367fe352b Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Mon, 29 Jan 2024 17:13:23 +0100 Subject: [PATCH 385/707] dt-bindings: hwmon: Add LTC4282 bindings Add bindings for the LTC4282 High Current Hot Swap Controller with I2C Compatible Monitoring. Reviewed-by: Conor Dooley Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-1-fe75798164cc@analog.com Signed-off-by: Guenter Roeck --- .../bindings/hwmon/adi,ltc4282.yaml | 159 ++++++++++++++++++ MAINTAINERS | 6 + 2 files changed, 165 insertions(+) create mode 100644 Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml diff --git a/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml b/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml new file mode 100644 index 00000000000000..4854b95a93e341 --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/hwmon/adi,ltc4282.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C + +maintainers: + - Nuno Sa + +description: | + Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C. + + https://www.analog.com/media/en/technical-documentation/data-sheets/ltc4282.pdf + +properties: + compatible: + enum: + - adi,ltc4282 + + reg: + maxItems: 1 + + vdd-supply: true + + clocks: + maxItems: 1 + + '#clock-cells': + const: 0 + + adi,rsense-nano-ohms: + description: Value of the sense resistor. + + adi,vin-mode-microvolt: + description: + Selects operating range for the Undervoltage, Overvoltage and Foldback + pins. Also for the ADC. Should be set to the nominal input voltage. + enum: [3300000, 5000000, 12000000, 24000000] + default: 12000000 + + adi,fet-bad-timeout-ms: + description: + From the moment a FET bad conditions is present, this property selects the + wait time/timeout for a FET-bad fault to be signaled. Setting this to 0, + disables FET bad faults to be reported. + default: 255 + maximum: 255 + + adi,overvoltage-dividers: + description: | + Select which dividers to use for VDD Overvoltage detection. Note that + when the internal dividers are used the threshold is referenced to VDD. + The percentages in the datasheet are misleading since the actual values + to look for are in the "Absolute Maximum Ratings" table in the + "Comparator Inputs" section. In there there's a line for each of the 5%, + 10% and 15% settings with the actual min, typical and max tolerances. + $ref: /schemas/types.yaml#/definitions/string + enum: [external, vdd_5_percent, vdd_10_percent, vdd_15_percent] + default: external + + adi,undervoltage-dividers: + description: | + Select which dividers to use for VDD Overvoltage detection. Note that + when the internal dividers are used the threshold is referenced to VDD. + The percentages in the datasheet are misleading since the actual values + to look for are in the "Absolute Maximum Ratings" table in the + "Comparator Inputs" section. In there there's a line for each of the 5%, + 10% and 15% settings with the actual min, typical and max tolerances. + $ref: /schemas/types.yaml#/definitions/string + enum: [external, vdd_5_percent, vdd_10_percent, vdd_15_percent] + default: external + + adi,current-limit-sense-microvolt: + description: + The current limit sense voltage of the chip is adjustable between + 12.5mV and 34.4mV in 3.1mV steps. This effectively limits the current + on the load. + enum: [12500, 15625, 18750, 21875, 25000, 28125, 31250, 34375] + default: 25000 + + adi,overcurrent-retry: + description: + If set, enables the chip to auto-retry 256 timer cycles after an + Overcurrent fault. + type: boolean + + adi,overvoltage-retry-disable: + description: + If set, disables the chip to auto-retry 50ms after an Overvoltage fault. + It's enabled by default. + type: boolean + + adi,undervoltage-retry-disable: + description: + If set, disables the chip to auto-retry 50ms after an Undervoltage fault. + It's enabled by default. + type: boolean + + adi,fault-log-enable: + description: + If set, enables the FAULT_LOG and ADC_ALERT_LOG registers to be written + to the EEPROM when a fault bit transitions high and hence, will be + available after a power cycle (the chip loads the contents of + the EE_FAULT_LOG register - the one in EEPROM - into FAULT_LOG at boot). + type: boolean + + adi,gpio1-mode: + description: Defines the function of the Pin. It can indicate that power is + good (PULL the pin low when power is not good) or that power is bad (Go + into high-z when power is not good). + $ref: /schemas/types.yaml#/definitions/string + enum: [power_bad, power_good] + default: power_good + + adi,gpio2-mode: + description: Defines the function of the Pin. It can be set as the input for + the ADC or indicating that the MOSFET is in stress (dissipating power). + $ref: /schemas/types.yaml#/definitions/string + enum: [adc_input, stress_fet] + default: adc_input + + adi,gpio3-monitor-enable: + description: If set, gpio3 is set as input for the ADC instead of gpio2. + type: boolean + +allOf: + - if: + required: + - adi,gpio3-monitor-enable + then: + properties: + adi,gpio2-mode: + const: stress_fet + +required: + - compatible + - reg + - adi,rsense-nano-ohms + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + hwmon@50 { + compatible = "adi,ltc4282"; + reg = <0x50>; + adi,rsense-nano-ohms = <500>; + + adi,gpio1-mode = "power_good"; + adi,gpio2-mode = "adc_input"; + }; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index 5e7239cb40ea6a..3c5f7ae166f0f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12759,6 +12759,12 @@ S: Maintained F: Documentation/hwmon/ltc4261.rst F: drivers/hwmon/ltc4261.c +LTC4282 HARDWARE MONITOR DRIVER +M: Nuno Sa +L: linux-hwmon@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml + LTC4286 HARDWARE MONITOR DRIVER M: Delphine CC Chiu L: linux-i2c@vger.kernel.org From 5b413e5322b62da39794187f3c5ebe3884a3ac27 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Mon, 29 Jan 2024 17:13:24 +0100 Subject: [PATCH 386/707] hwmon: add fault attribute for voltage channels Sometimes a voltage channel might have an hard failure (eg: a shorted MOSFET). Hence, add a fault attribute to report such failures. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-2-fe75798164cc@analog.com Signed-off-by: Guenter Roeck --- Documentation/ABI/testing/sysfs-class-hwmon | 9 +++++++++ drivers/hwmon/hwmon.c | 1 + include/linux/hwmon.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-hwmon b/Documentation/ABI/testing/sysfs-class-hwmon index 3dac923c9b0ef0..6c4e68ad4a8331 100644 --- a/Documentation/ABI/testing/sysfs-class-hwmon +++ b/Documentation/ABI/testing/sysfs-class-hwmon @@ -149,6 +149,15 @@ Description: RW +What: /sys/class/hwmon/hwmonX/inY_fault +Description: + Reports a voltage hard failure (eg: shorted component) + + - 1: Failed + - 0: Ok + + RO + What: /sys/class/hwmon/hwmonX/cpuY_vid Description: CPU core reference voltage. diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index c7dd3f5b2bd549..18705049ad610e 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -510,6 +510,7 @@ static const char * const hwmon_in_attr_templates[] = { [hwmon_in_rated_min] = "in%d_rated_min", [hwmon_in_rated_max] = "in%d_rated_max", [hwmon_in_beep] = "in%d_beep", + [hwmon_in_fault] = "in%d_fault", }; static const char * const hwmon_curr_attr_templates[] = { diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index c2c0da18dfa369..c7885fdce88f09 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -141,6 +141,7 @@ enum hwmon_in_attributes { hwmon_in_rated_min, hwmon_in_rated_max, hwmon_in_beep, + hwmon_in_fault, }; #define HWMON_I_ENABLE BIT(hwmon_in_enable) @@ -162,6 +163,7 @@ enum hwmon_in_attributes { #define HWMON_I_RATED_MIN BIT(hwmon_in_rated_min) #define HWMON_I_RATED_MAX BIT(hwmon_in_rated_max) #define HWMON_I_BEEP BIT(hwmon_in_beep) +#define HWMON_I_FAULT BIT(hwmon_in_fault) enum hwmon_curr_attributes { hwmon_curr_enable, From 6120fec68e78eefdf7d89325668c082a90817c75 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Mon, 29 Jan 2024 17:13:25 +0100 Subject: [PATCH 387/707] hwmon: ltc4282: add support for the LTC4282 chip The LTC4282 hot swap controller allows a board to be safely inserted and removed from a live backplane. Using one or more external N-channel pass transistors, board supply voltage and inrush current are ramped up at an adjustable rate. An I2C interface and onboard ADC allows for monitoring of board current, voltage, power, energy and fault status. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-3-fe75798164cc@analog.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/index.rst | 1 + Documentation/hwmon/ltc4282.rst | 133 +++ MAINTAINERS | 2 + drivers/hwmon/Kconfig | 11 + drivers/hwmon/Makefile | 1 + drivers/hwmon/ltc4282.c | 1784 +++++++++++++++++++++++++++++++ 6 files changed, 1932 insertions(+) create mode 100644 Documentation/hwmon/ltc4282.rst create mode 100644 drivers/hwmon/ltc4282.c diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index c7ed1f73ac0661..f16c6dfaec7dc9 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -129,6 +129,7 @@ Hardware Monitoring Kernel Drivers ltc4245 ltc4260 ltc4261 + ltc4282 ltc4286 max127 max15301 diff --git a/Documentation/hwmon/ltc4282.rst b/Documentation/hwmon/ltc4282.rst new file mode 100644 index 00000000000000..a87ec3564998fe --- /dev/null +++ b/Documentation/hwmon/ltc4282.rst @@ -0,0 +1,133 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +Kernel drivers ltc4282 +========================================== + +Supported chips: + + * Analog Devices LTC4282 + + Prefix: 'ltc4282' + + Addresses scanned: - I2C 0x40 - 0x5A (7-bit) + Addresses scanned: - I2C 0x80 - 0xB4 with a step of 2 (8-bit) + + Datasheet: + + https://www.analog.com/media/en/technical-documentation/data-sheets/ltc4282.pdf + +Author: Nuno Sá + +Description +___________ + +The LTC4282 hot swap controller allows a board to be safely inserted and removed +from a live backplane. Using one or more external N-channel pass transistors, +board supply voltage and inrush current are ramped up at an adjustable rate. An +I2C interface and onboard ADC allows for monitoring of board current, voltage, +power, energy and fault status. The device features analog foldback current +limiting and supply monitoring for applications from 2.9V to 33V. Dual 12V gate +drive allows high power applications to either share safe operating area across +parallel MOSFETs or support a 2-stage start-up that first charges the load +capacitance followed by enabling a low on-resistance path to the load. The +LTC4282 is well suited to high power applications because the precise monitoring +capability and accurate current limiting reduce the extremes in which both loads +and power supplies must safely operate. Non-volatile configuration allows for +flexibility in the autonomous generation of alerts and response to faults. + +Sysfs entries +_____________ + +The following attributes are supported. Limits are read-write and all the other +attributes are read-only. Note that in0 and in1 are mutually exclusive. Enabling +one disables the other and disabling one enables the other. + +======================= ========================================== +in0_input Output voltage (mV). +in0_min Undervoltage threshold +in0_max Overvoltage threshold +in0_lowest Lowest measured voltage +in0_highest Highest measured voltage +in0_reset_history Write 1 to reset in0 history. + Also clears fet bad and short fault logs. +in0_min_alarm Undervoltage alarm +in0_max_alarm Overvoltage alarm +in0_enable Enable/Disable VSOURCE monitoring +in0_fault Failure in the MOSFETs. Either bad or shorted FET. +in0_label Channel label (VSOURCE) + +in1_input Input voltage (mV). +in1_min Undervoltage threshold +in1_max Overvoltage threshold +in1_lowest Lowest measured voltage +in1_highest Highest measured voltage +in1_reset_history Write 1 to reset in1 history. + Also clears over/undervoltage fault logs. +in1_min_alarm Undervoltage alarm +in1_max_alarm Overvoltage alarm +in1_lcrit_alarm Critical Undervoltage alarm +in1_crit_alarm Critical Overvoltage alarm +in1_enable Enable/Disable VDD monitoring +in1_label Channel label (VDD) + +in2_input GPIO voltage (mV) +in2_min Undervoltage threshold +in2_max Overvoltage threshold +in2_lowest Lowest measured voltage +in2_highest Highest measured voltage +in2_reset_history Write 1 to reset in2 history +in2_min_alarm Undervoltage alarm +in2_max_alarm Overvoltage alarm +in2_label Channel label (VGPIO) + +curr1_input Sense current (mA) +curr1_min Undercurrent threshold +curr1_max Overcurrent threshold +curr1_lowest Lowest measured current +curr1_highest Highest measured current +curr1_reset_history Write 1 to reset curr1 history. + Also clears overcurrent fault logs. +curr1_min_alarm Undercurrent alarm +curr1_max_alarm Overcurrent alarm +curr1_crit_alarm Critical Overcurrent alarm +curr1_label Channel label (ISENSE) + +power1_input Power (in uW) +power1_min Low power threshold +power1_max High power threshold +power1_input_lowest Historical minimum power use +power1_input_highest Historical maximum power use +power1_reset_history Write 1 to reset power1 history. + Also clears power bad fault logs. +power1_min_alarm Low power alarm +power1_max_alarm High power alarm +power1_label Channel label (Power) + +energy1_input Measured energy over time (in microJoule) +energy1_enable Enable/Disable Energy accumulation +======================= ========================================== + +DebugFs entries +_______________ + +The chip also has a fault log register where failures can be logged. Hence, +as these are logging events, we give access to them in debugfs. Note that +even if some failure is detected in these logs, it does necessarily mean +that the failure is still present. As mentioned in the proper Sysfs entries, +these logs can be cleared by writing in the proper reset_history attribute. + +.. warning:: The debugfs interface is subject to change without notice + and is only available when the kernel is compiled with + ``CONFIG_DEBUG_FS`` defined. + +``/sys/kernel/debug/ltc4282-hwmon[X]/`` +contains the following attributes: + +======================= ========================================== +power1_bad_fault_log Set to 1 by a power1 bad fault occurring. +in0_fet_short_fault_log Set to 1 when the ADC detects a FET-short fault. +in0_fet_bad_fault_log Set to 1 when a FET-BAD fault occurs. +in1_crit_fault_log Set to 1 by a VDD overvoltage fault occurring. +in1_lcrit_fault_log Set to 1 by a VDD undervoltage fault occurring. +curr1_crit_fault_log Set to 1 by an overcurrent fault occurring. +======================= ========================================== diff --git a/MAINTAINERS b/MAINTAINERS index 3c5f7ae166f0f0..e49eea8b18066b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12764,6 +12764,8 @@ M: Nuno Sa L: linux-hwmon@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml +F: Documentation/hwmon/ltc4282.rst +F: drivers/hwmon/ltc4282.c LTC4286 HARDWARE MONITOR DRIVER M: Delphine CC Chiu diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index a608264da87df8..f6160cc7007773 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -1038,6 +1038,17 @@ config SENSORS_LTC4261 This driver can also be built as a module. If so, the module will be called ltc4261. +config SENSORS_LTC4282 + tristate "Analog Devices LTC4282" + depends on I2C + select REGMAP_I2C + help + If you say yes here you get support for Analog Devices LTC4282 + High Current Hot Swap Controller I2C interface. + + This driver can also be built as a module. If so, the module will + be called ltc4282. + config SENSORS_LTQ_CPUTEMP bool "Lantiq cpu temperature sensor driver" depends on SOC_XWAY diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 47be39af5c0381..8bfc422a29e532 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -136,6 +136,7 @@ obj-$(CONFIG_SENSORS_LTC4222) += ltc4222.o obj-$(CONFIG_SENSORS_LTC4245) += ltc4245.o obj-$(CONFIG_SENSORS_LTC4260) += ltc4260.o obj-$(CONFIG_SENSORS_LTC4261) += ltc4261.o +obj-$(CONFIG_SENSORS_LTC4282) += ltc4282.o obj-$(CONFIG_SENSORS_LTQ_CPUTEMP) += ltq-cputemp.o obj-$(CONFIG_SENSORS_MAX1111) += max1111.o obj-$(CONFIG_SENSORS_MAX127) += max127.o diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c new file mode 100644 index 00000000000000..cf4c9cfec8b42b --- /dev/null +++ b/drivers/hwmon/ltc4282.c @@ -0,0 +1,1784 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C + * + * Copyright 2023 Analog Devices Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LTC4282_CTRL_LSB 0x00 + #define LTC4282_CTRL_OV_RETRY_MASK BIT(0) + #define LTC4282_CTRL_UV_RETRY_MASK BIT(1) + #define LTC4282_CTRL_OC_RETRY_MASK BIT(2) + #define LTC4282_CTRL_ON_ACTIVE_LOW_MASK BIT(5) + #define LTC4282_CTRL_ON_DELAY_MASK BIT(6) +#define LTC4282_CTRL_MSB 0x01 + #define LTC4282_CTRL_VIN_MODE_MASK GENMASK(1, 0) + #define LTC4282_CTRL_OV_MODE_MASK GENMASK(3, 2) + #define LTC4282_CTRL_UV_MODE_MASK GENMASK(5, 4) +#define LTC4282_FAULT_LOG 0x04 + #define LTC4282_OV_FAULT_MASK BIT(0) + #define LTC4282_UV_FAULT_MASK BIT(1) + #define LTC4282_VDD_FAULT_MASK \ + (LTC4282_OV_FAULT_MASK | LTC4282_UV_FAULT_MASK) + #define LTC4282_OC_FAULT_MASK BIT(2) + #define LTC4282_POWER_BAD_FAULT_MASK BIT(3) + #define LTC4282_FET_SHORT_FAULT_MASK BIT(5) + #define LTC4282_FET_BAD_FAULT_MASK BIT(6) + #define LTC4282_FET_FAILURE_FAULT_MASK \ + (LTC4282_FET_SHORT_FAULT_MASK | LTC4282_FET_BAD_FAULT_MASK) +#define LTC4282_ADC_ALERT_LOG 0x05 + #define LTC4282_GPIO_ALARM_L_MASK BIT(0) + #define LTC4282_GPIO_ALARM_H_MASK BIT(1) + #define LTC4282_VSOURCE_ALARM_L_MASK BIT(2) + #define LTC4282_VSOURCE_ALARM_H_MASK BIT(3) + #define LTC4282_VSENSE_ALARM_L_MASK BIT(4) + #define LTC4282_VSENSE_ALARM_H_MASK BIT(5) + #define LTC4282_POWER_ALARM_L_MASK BIT(6) + #define LTC4282_POWER_ALARM_H_MASK BIT(7) +#define LTC4282_FET_BAD_FAULT_TIMEOUT 0x06 + #define LTC4282_FET_BAD_MAX_TIMEOUT 255 +#define LTC4282_GPIO_CONFIG 0x07 + #define LTC4282_GPIO_2_FET_STRESS_MASK BIT(1) + #define LTC4282_GPIO_1_CONFIG_MASK GENMASK(5, 4) +#define LTC4282_VGPIO_MIN 0x08 +#define LTC4282_VGPIO_MAX 0x09 +#define LTC4282_VSOURCE_MIN 0x0a +#define LTC4282_VSOURCE_MAX 0x0b +#define LTC4282_VSENSE_MIN 0x0c +#define LTC4282_VSENSE_MAX 0x0d +#define LTC4282_POWER_MIN 0x0e +#define LTC4282_POWER_MAX 0x0f +#define LTC4282_CLK_DIV 0x10 + #define LTC4282_CLK_DIV_MASK GENMASK(4, 0) + #define LTC4282_CLKOUT_MASK GENMASK(6, 5) +#define LTC4282_ILIM_ADJUST 0x11 + #define LTC4282_GPIO_MODE_MASK BIT(1) + #define LTC4282_VDD_MONITOR_MASK BIT(2) + #define LTC4282_FOLDBACK_MODE_MASK GENMASK(4, 3) + #define LTC4282_ILIM_ADJUST_MASK GENMASK(7, 5) +#define LTC4282_ENERGY 0x12 +#define LTC4282_TIME_COUNTER 0x18 +#define LTC4282_ALERT_CTRL 0x1c + #define LTC4282_ALERT_OUT_MASK BIT(6) +#define LTC4282_ADC_CTRL 0x1d + #define LTC4282_FAULT_LOG_EN_MASK BIT(2) + #define LTC4282_METER_HALT_MASK BIT(5) + #define LTC4282_METER_RESET_MASK BIT(6) + #define LTC4282_RESET_MASK BIT(7) +#define LTC4282_STATUS_LSB 0x1e + #define LTC4282_OV_STATUS_MASK BIT(0) + #define LTC4282_UV_STATUS_MASK BIT(1) + #define LTC4282_VDD_STATUS_MASK \ + (LTC4282_OV_STATUS_MASK | LTC4282_UV_STATUS_MASK) + #define LTC4282_OC_STATUS_MASK BIT(2) + #define LTC4282_POWER_GOOD_MASK BIT(3) + #define LTC4282_FET_FAILURE_MASK GENMASK(6, 5) +#define LTC4282_STATUS_MSB 0x1f +#define LTC4282_RESERVED_1 0x32 +#define LTC4282_RESERVED_2 0x33 +#define LTC4282_VGPIO 0x34 +#define LTC4282_VGPIO_LOWEST 0x36 +#define LTC4282_VGPIO_HIGHEST 0x38 +#define LTC4282_VSOURCE 0x3a +#define LTC4282_VSOURCE_LOWEST 0x3c +#define LTC4282_VSOURCE_HIGHEST 0x3e +#define LTC4282_VSENSE 0x40 +#define LTC4282_VSENSE_LOWEST 0x42 +#define LTC4282_VSENSE_HIGHEST 0x44 +#define LTC4282_POWER 0x46 +#define LTC4282_POWER_LOWEST 0x48 +#define LTC4282_POWER_HIGHEST 0x4a +#define LTC4282_RESERVED_3 0x50 + +#define LTC4282_CLKIN_MIN (250 * KILO) +#define LTC4282_CLKIN_MAX (15500 * KILO) +#define LTC4282_CLKIN_RANGE (LTC4282_CLKIN_MAX - LTC4282_CLKIN_MIN + 1) +#define LTC4282_CLKOUT_SYSTEM (250 * KILO) +#define LTC4282_CLKOUT_CNV 15 + +enum { + LTC4282_CHAN_VSOURCE, + LTC4282_CHAN_VDD, + LTC4282_CHAN_VGPIO, +}; + +struct ltc4282_cache { + u32 in_max_raw; + u32 in_min_raw; + long in_highest; + long in_lowest; + bool en; +}; + +struct ltc4282_state { + struct regmap *map; + /* Protect against multiple accesses to the device registers */ + struct mutex lock; + struct clk_hw clk_hw; + /* + * Used to cache values for VDD/VSOURCE depending which will be used + * when hwmon is not enabled for that channel. Needed because they share + * the same registers. + */ + struct ltc4282_cache in0_1_cache[LTC4282_CHAN_VGPIO]; + u32 vsense_max; + long power_max; + u32 rsense; + u16 vdd; + u16 vfs_out; + bool energy_en; +}; + +enum { + LTC4282_CLKOUT_NONE, + LTC4282_CLKOUT_INT, + LTC4282_CLKOUT_TICK, +}; + +static int ltc4282_set_rate(struct clk_hw *hw, + unsigned long rate, unsigned long parent_rate) +{ + struct ltc4282_state *st = container_of(hw, struct ltc4282_state, + clk_hw); + u32 val = LTC4282_CLKOUT_INT; + + if (rate == LTC4282_CLKOUT_CNV) + val = LTC4282_CLKOUT_TICK; + + return regmap_update_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK, + FIELD_PREP(LTC4282_CLKOUT_MASK, val)); +} + +/* + * Note the 15HZ conversion rate assumes 12bit ADC which is what we are + * supporting for now. + */ +static const unsigned int ltc4282_out_rates[] = { + LTC4282_CLKOUT_CNV, LTC4282_CLKOUT_SYSTEM +}; + +static long ltc4282_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + int idx = find_closest(rate, ltc4282_out_rates, + ARRAY_SIZE(ltc4282_out_rates)); + + return ltc4282_out_rates[idx]; +} + +static unsigned long ltc4282_recalc_rate(struct clk_hw *hw, + unsigned long parent) +{ + struct ltc4282_state *st = container_of(hw, struct ltc4282_state, + clk_hw); + u32 clkdiv; + int ret; + + ret = regmap_read(st->map, LTC4282_CLK_DIV, &clkdiv); + if (ret) + return 0; + + clkdiv = FIELD_GET(LTC4282_CLKOUT_MASK, clkdiv); + if (!clkdiv) + return 0; + if (clkdiv == LTC4282_CLKOUT_INT) + return LTC4282_CLKOUT_SYSTEM; + + return LTC4282_CLKOUT_CNV; +} + +static void ltc4282_disable(struct clk_hw *clk_hw) +{ + struct ltc4282_state *st = container_of(clk_hw, struct ltc4282_state, + clk_hw); + + regmap_clear_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK); +} + +static int ltc4282_read_voltage_word(const struct ltc4282_state *st, u32 reg, + u32 fs, long *val) +{ + __be16 in; + int ret; + + ret = regmap_bulk_read(st->map, reg, &in, sizeof(in)); + if (ret) + return ret; + + /* + * This is also used to calculate current in which case fs comes in + * 10 * uV. Hence the ULL usage. + */ + *val = DIV_ROUND_CLOSEST_ULL(be16_to_cpu(in) * (u64)fs, U16_MAX); + return 0; +} + +static int ltc4282_read_voltage_byte_cached(const struct ltc4282_state *st, + u32 reg, u32 fs, long *val, + u32 *cached_raw) +{ + int ret; + u32 in; + + if (cached_raw) { + in = *cached_raw; + } else { + ret = regmap_read(st->map, reg, &in); + if (ret) + return ret; + } + + *val = DIV_ROUND_CLOSEST(in * fs, U8_MAX); + return 0; +} + +static int ltc4282_read_voltage_byte(const struct ltc4282_state *st, u32 reg, + u32 fs, long *val) +{ + return ltc4282_read_voltage_byte_cached(st, reg, fs, val, NULL); +} + +static int __ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask, + long *val) +{ + u32 alarm; + int ret; + + ret = regmap_read(st->map, reg, &alarm); + if (ret) + return ret; + + *val = !!(alarm & mask); + + /* if not status/fault logs, clear the alarm after reading it */ + if (reg != LTC4282_STATUS_LSB && reg != LTC4282_FAULT_LOG) + return regmap_clear_bits(st->map, reg, mask); + + return 0; +} + +static int ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask, + long *val) +{ + guard(mutex)(&st->lock); + return __ltc4282_read_alarm(st, reg, mask, val); +} + +static int ltc4282_vdd_source_read_in(struct ltc4282_state *st, u32 channel, + long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) + return -ENODATA; + + return ltc4282_read_voltage_word(st, LTC4282_VSOURCE, st->vfs_out, val); +} + +static int ltc4282_vdd_source_read_hist(struct ltc4282_state *st, u32 reg, + u32 channel, long *cached, long *val) +{ + int ret; + + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) { + *val = *cached; + return 0; + } + + ret = ltc4282_read_voltage_word(st, reg, st->vfs_out, val); + if (ret) + return ret; + + *cached = *val; + return 0; +} + +static int ltc4282_vdd_source_read_lim(struct ltc4282_state *st, u32 reg, + u32 channel, u32 *cached, long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) + return ltc4282_read_voltage_byte_cached(st, reg, st->vfs_out, + val, cached); + + return ltc4282_read_voltage_byte(st, reg, st->vfs_out, val); +} + +static int ltc4282_vdd_source_read_alm(struct ltc4282_state *st, u32 mask, + u32 channel, long *val) +{ + guard(mutex)(&st->lock); + if (!st->in0_1_cache[channel].en) { + /* + * Do this otherwise alarms can get confused because we clear + * them after reading them. So, if someone mistakenly reads + * VSOURCE right before VDD (or the other way around), we might + * get no alarm just because it was cleared when reading VSOURCE + * and had no time for a new conversion and thus having the + * alarm again. + */ + *val = 0; + return 0; + } + + return __ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, mask, val); +} + +static int ltc4282_read_in(struct ltc4282_state *st, u32 attr, long *val, + u32 channel) +{ + switch (attr) { + case hwmon_in_input: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, LTC4282_VGPIO, + 1280, val); + + return ltc4282_vdd_source_read_in(st, channel, val); + case hwmon_in_highest: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, + LTC4282_VGPIO_HIGHEST, + 1280, val); + + return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_HIGHEST, + channel, + &st->in0_1_cache[channel].in_highest, val); + case hwmon_in_lowest: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_word(st, LTC4282_VGPIO_LOWEST, + 1280, val); + + return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_LOWEST, + channel, + &st->in0_1_cache[channel].in_lowest, val); + case hwmon_in_max_alarm: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_GPIO_ALARM_H_MASK, + val); + + return ltc4282_vdd_source_read_alm(st, + LTC4282_VSOURCE_ALARM_H_MASK, + channel, val); + case hwmon_in_min_alarm: + if (channel == LTC4282_CHAN_VGPIO) + ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_GPIO_ALARM_L_MASK, val); + + return ltc4282_vdd_source_read_alm(st, + LTC4282_VSOURCE_ALARM_L_MASK, + channel, val); + case hwmon_in_crit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_OV_STATUS_MASK, val); + case hwmon_in_lcrit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_UV_STATUS_MASK, val); + case hwmon_in_max: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MAX, + 1280, val); + + return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MAX, + channel, + &st->in0_1_cache[channel].in_max_raw, val); + case hwmon_in_min: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MIN, + 1280, val); + + return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MIN, + channel, + &st->in0_1_cache[channel].in_min_raw, val); + case hwmon_in_enable: + scoped_guard(mutex, &st->lock) { + *val = st->in0_1_cache[channel].en; + } + return 0; + case hwmon_in_fault: + /* + * We report failure if we detect either a fer_bad or a + * fet_short in the status register. + */ + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_FET_FAILURE_MASK, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read_current_word(const struct ltc4282_state *st, u32 reg, + long *val) +{ + long in; + int ret; + + /* + * We pass in full scale in 10 * micro (note that 40 is already + * millivolt) so we have better approximations to calculate current. + */ + ret = ltc4282_read_voltage_word(st, reg, DECA * 40 * MILLI, &in); + if (ret) + return ret; + + *val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense); + + return 0; +} + +static int ltc4282_read_current_byte(const struct ltc4282_state *st, u32 reg, + long *val) +{ + long in; + int ret; + + ret = ltc4282_read_voltage_byte(st, reg, DECA * 40 * MILLI, &in); + if (ret) + return ret; + + *val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense); + + return 0; +} + +static int ltc4282_read_curr(struct ltc4282_state *st, const u32 attr, + long *val) +{ + switch (attr) { + case hwmon_curr_input: + return ltc4282_read_current_word(st, LTC4282_VSENSE, val); + case hwmon_curr_highest: + return ltc4282_read_current_word(st, LTC4282_VSENSE_HIGHEST, + val); + case hwmon_curr_lowest: + return ltc4282_read_current_word(st, LTC4282_VSENSE_LOWEST, + val); + case hwmon_curr_max: + return ltc4282_read_current_byte(st, LTC4282_VSENSE_MAX, val); + case hwmon_curr_min: + return ltc4282_read_current_byte(st, LTC4282_VSENSE_MIN, val); + case hwmon_curr_max_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_VSENSE_ALARM_H_MASK, val); + case hwmon_curr_min_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_VSENSE_ALARM_L_MASK, val); + case hwmon_curr_crit_alarm: + return ltc4282_read_alarm(st, LTC4282_STATUS_LSB, + LTC4282_OC_STATUS_MASK, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read_power_word(const struct ltc4282_state *st, u32 reg, + long *val) +{ + u64 temp = DECA * 40ULL * st->vfs_out * BIT(16), temp_2; + __be16 raw; + u16 power; + int ret; + + ret = regmap_bulk_read(st->map, reg, &raw, sizeof(raw)); + if (ret) + return ret; + + power = be16_to_cpu(raw); + /* + * Power is given by: + * P = CODE(16b) * 0.040 * Vfs(out) * 2^16 / ((2^16 - 1)^2 * Rsense) + */ + if (check_mul_overflow(power * temp, MICRO, &temp_2)) { + temp = DIV_ROUND_CLOSEST_ULL(power * temp, U16_MAX); + *val = DIV64_U64_ROUND_CLOSEST(temp * MICRO, + U16_MAX * (u64)st->rsense); + return 0; + } + + *val = DIV64_U64_ROUND_CLOSEST(temp_2, + st->rsense * int_pow(U16_MAX, 2)); + + return 0; +} + +static int ltc4282_read_power_byte(const struct ltc4282_state *st, u32 reg, + long *val) +{ + u32 power; + u64 temp; + int ret; + + ret = regmap_read(st->map, reg, &power); + if (ret) + return ret; + + temp = power * 40 * DECA * st->vfs_out * BIT_ULL(8); + *val = DIV64_U64_ROUND_CLOSEST(temp * MICRO, + int_pow(U8_MAX, 2) * st->rsense); + + return 0; +} + +static int ltc4282_read_energy(const struct ltc4282_state *st, u64 *val) +{ + u64 temp, energy; + __be64 raw; + int ret; + + ret = regmap_bulk_read(st->map, LTC4282_ENERGY, &raw, 6); + if (ret) + return ret; + + energy = be64_to_cpu(raw) >> 16; + /* + * The formula for energy is given by: + * E = CODE(48b) * 0.040 * Vfs(out) * Tconv * 256 / + * ((2^16 - 1)^2 * Rsense) + * + * Since we only support 12bit ADC, Tconv = 0.065535s. Passing Vfs(out) + * and 0.040 to mV and Tconv to us, we can simplify the formula to: + * E = CODE(48b) * 40 * Vfs(out) * 256 / (U16_MAX * Rsense) + * + * As Rsense can have tenths of micro-ohm resolution, we need to + * multiply by DECA to get microujoule. + */ + if (check_mul_overflow(DECA * st->vfs_out * 40 * BIT(8), energy, &temp)) { + temp = DIV_ROUND_CLOSEST(DECA * st->vfs_out * 40 * BIT(8), U16_MAX); + *val = DIV_ROUND_CLOSEST_ULL(temp * energy, st->rsense); + return 0; + } + + *val = DIV64_U64_ROUND_CLOSEST(temp, U16_MAX * (u64)st->rsense); + + return 0; +} + +static int ltc4282_read_power(struct ltc4282_state *st, const u32 attr, + long *val) +{ + switch (attr) { + case hwmon_power_input: + return ltc4282_read_power_word(st, LTC4282_POWER, val); + case hwmon_power_input_highest: + return ltc4282_read_power_word(st, LTC4282_POWER_HIGHEST, val); + case hwmon_power_input_lowest: + return ltc4282_read_power_word(st, LTC4282_POWER_LOWEST, val); + case hwmon_power_max_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_POWER_ALARM_H_MASK, val); + case hwmon_power_min_alarm: + return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, + LTC4282_POWER_ALARM_L_MASK, val); + case hwmon_power_max: + return ltc4282_read_power_byte(st, LTC4282_POWER_MAX, val); + case hwmon_power_min: + return ltc4282_read_power_byte(st, LTC4282_POWER_MIN, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + + switch (type) { + case hwmon_in: + return ltc4282_read_in(st, attr, val, channel); + case hwmon_curr: + return ltc4282_read_curr(st, attr, val); + case hwmon_power: + return ltc4282_read_power(st, attr, val); + case hwmon_energy: + scoped_guard(mutex, &st->lock) { + *val = st->energy_en; + } + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_write_power_byte(const struct ltc4282_state *st, u32 reg, + long val) +{ + u32 power; + u64 temp; + + if (val > st->power_max) + val = st->power_max; + + temp = val * int_pow(U8_MAX, 2) * st->rsense; + power = DIV64_U64_ROUND_CLOSEST(temp, + MICRO * DECA * 256ULL * st->vfs_out * 40); + + return regmap_write(st->map, reg, power); +} + +static int ltc4282_write_power_word(const struct ltc4282_state *st, u32 reg, + long val) +{ + u64 temp = int_pow(U16_MAX, 2) * st->rsense, temp_2; + __be16 __raw; + u16 code; + + if (check_mul_overflow(temp, val, &temp_2)) { + temp = DIV_ROUND_CLOSEST_ULL(temp, DECA * MICRO); + code = DIV64_U64_ROUND_CLOSEST(temp * val, + 40ULL * BIT(16) * st->vfs_out); + } else { + temp = DECA * MICRO * 40ULL * BIT(16) * st->vfs_out; + code = DIV64_U64_ROUND_CLOSEST(temp_2, temp); + } + + __raw = cpu_to_be16(code); + return regmap_bulk_write(st->map, reg, &__raw, sizeof(__raw)); +} + +static int __ltc4282_in_write_history(const struct ltc4282_state *st, u32 reg, + long lowest, long highest, u32 fs) +{ + __be16 __raw; + u16 tmp; + int ret; + + tmp = DIV_ROUND_CLOSEST(U16_MAX * lowest, fs); + + __raw = cpu_to_be16(tmp); + + ret = regmap_bulk_write(st->map, reg, &__raw, 2); + if (ret) + return ret; + + tmp = DIV_ROUND_CLOSEST(U16_MAX * highest, fs); + + __raw = cpu_to_be16(tmp); + + return regmap_bulk_write(st->map, reg + 2, &__raw, 2); +} + +static int ltc4282_in_write_history(struct ltc4282_state *st, u32 reg, + long lowest, long highest, u32 fs) +{ + guard(mutex)(&st->lock); + return __ltc4282_in_write_history(st, reg, lowest, highest, fs); +} + +static int ltc4282_power_reset_hist(struct ltc4282_state *st) +{ + int ret; + + guard(mutex)(&st->lock); + + ret = ltc4282_write_power_word(st, LTC4282_POWER_LOWEST, + st->power_max); + if (ret) + return ret; + + ret = ltc4282_write_power_word(st, LTC4282_POWER_HIGHEST, 0); + if (ret) + return ret; + + /* now, let's also clear possible power_bad fault logs */ + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_POWER_BAD_FAULT_MASK); +} + +static int ltc4282_write_power(struct ltc4282_state *st, u32 attr, + long val) +{ + switch (attr) { + case hwmon_power_max: + return ltc4282_write_power_byte(st, LTC4282_POWER_MAX, val); + case hwmon_power_min: + return ltc4282_write_power_byte(st, LTC4282_POWER_MIN, val); + case hwmon_power_reset_history: + return ltc4282_power_reset_hist(st); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_write_voltage_byte_cached(const struct ltc4282_state *st, + u32 reg, u32 fs, long val, + u32 *cache_raw) +{ + u32 in; + + if (val >= fs) + in = U8_MAX; + else + in = DIV_ROUND_CLOSEST(val * U8_MAX, fs); + + if (cache_raw) { + *cache_raw = in; + return 0; + } + + return regmap_write(st->map, reg, in); +} + +static int ltc4282_write_voltage_byte(const struct ltc4282_state *st, u32 reg, + u32 fs, long val) +{ + return ltc4282_write_voltage_byte_cached(st, reg, fs, val, NULL); +} + +static int ltc4282_cache_history(struct ltc4282_state *st, u32 channel) +{ + long val; + int ret; + + ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_LOWEST, st->vfs_out, + &val); + if (ret) + return ret; + + st->in0_1_cache[channel].in_lowest = val; + + ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_HIGHEST, + st->vfs_out, &val); + if (ret) + return ret; + + st->in0_1_cache[channel].in_highest = val; + + ret = regmap_read(st->map, LTC4282_VSOURCE_MIN, + &st->in0_1_cache[channel].in_min_raw); + if (ret) + return ret; + + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[channel].in_max_raw); +} + +static int ltc4282_cache_sync(struct ltc4282_state *st, u32 channel) +{ + int ret; + + ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->in0_1_cache[channel].in_lowest, + st->in0_1_cache[channel].in_highest, + st->vfs_out); + if (ret) + return ret; + + ret = regmap_write(st->map, LTC4282_VSOURCE_MIN, + st->in0_1_cache[channel].in_min_raw); + if (ret) + return ret; + + return regmap_write(st->map, LTC4282_VSOURCE_MAX, + st->in0_1_cache[channel].in_max_raw); +} + +static int ltc4282_vdd_source_write_lim(struct ltc4282_state *st, u32 reg, + int channel, u32 *cache, long val) +{ + int ret; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en) + ret = ltc4282_write_voltage_byte(st, reg, st->vfs_out, val); + else + ret = ltc4282_write_voltage_byte_cached(st, reg, st->vfs_out, + val, cache); + + return ret; +} + +static int ltc4282_vdd_source_reset_hist(struct ltc4282_state *st, int channel) +{ + long lowest = st->vfs_out; + int ret; + + if (channel == LTC4282_CHAN_VDD) + lowest = st->vdd; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en) { + ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + lowest, 0, st->vfs_out); + if (ret) + return ret; + } + + st->in0_1_cache[channel].in_lowest = lowest; + st->in0_1_cache[channel].in_highest = 0; + + /* + * We are also clearing possible fault logs in reset_history. Clearing + * the logs might be important when the auto retry bits are not enabled + * as the chip only enables the output again after having these logs + * cleared. As some of these logs are related to limits, it makes sense + * to clear them in here. For VDD, we need to clear under/over voltage + * events. For VSOURCE, fet_short and fet_bad... + */ + if (channel == LTC4282_CHAN_VSOURCE) + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_FET_FAILURE_FAULT_MASK); + + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_VDD_FAULT_MASK); +} + +/* + * We need to mux between VSOURCE and VDD which means they are mutually + * exclusive. Moreover, we can't really disable both VDD and VSOURCE as the ADC + * is continuously running (we cannot independently halt it without also + * stopping VGPIO). Hence, the logic is that disabling or enabling VDD will + * automatically have the reverse effect on VSOURCE and vice-versa. + */ +static int ltc4282_vdd_source_enable(struct ltc4282_state *st, int channel, + long val) +{ + int ret, other_chan = ~channel & 0x1; + u8 __val = val; + + guard(mutex)(&st->lock); + if (st->in0_1_cache[channel].en == !!val) + return 0; + + /* clearing the bit makes the ADC to monitor VDD */ + if (channel == LTC4282_CHAN_VDD) + __val = !__val; + + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_VDD_MONITOR_MASK, + FIELD_PREP(LTC4282_VDD_MONITOR_MASK, !!__val)); + if (ret) + return ret; + + st->in0_1_cache[channel].en = !!val; + st->in0_1_cache[other_chan].en = !val; + + if (st->in0_1_cache[channel].en) { + /* + * Then, we are disabling @other_chan. Let's save it's current + * history. + */ + ret = ltc4282_cache_history(st, other_chan); + if (ret) + return ret; + + return ltc4282_cache_sync(st, channel); + } + /* + * Then, we are enabling @other_chan. We need to do the opposite from + * above. + */ + ret = ltc4282_cache_history(st, channel); + if (ret) + return ret; + + return ltc4282_cache_sync(st, other_chan); +} + +static int ltc4282_write_in(struct ltc4282_state *st, u32 attr, long val, + int channel) +{ + switch (attr) { + case hwmon_in_max: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MAX, + 1280, val); + + return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MAX, + channel, + &st->in0_1_cache[channel].in_max_raw, val); + case hwmon_in_min: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MIN, + 1280, val); + + return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MIN, + channel, + &st->in0_1_cache[channel].in_min_raw, val); + case hwmon_in_reset_history: + if (channel == LTC4282_CHAN_VGPIO) + return ltc4282_in_write_history(st, + LTC4282_VGPIO_LOWEST, + 1280, 0, 1280); + + return ltc4282_vdd_source_reset_hist(st, channel); + case hwmon_in_enable: + return ltc4282_vdd_source_enable(st, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_curr_reset_hist(struct ltc4282_state *st) +{ + int ret; + + guard(mutex)(&st->lock); + + ret = __ltc4282_in_write_history(st, LTC4282_VSENSE_LOWEST, + st->vsense_max, 0, 40 * MILLI); + if (ret) + return ret; + + /* now, let's also clear possible overcurrent fault logs */ + return regmap_clear_bits(st->map, LTC4282_FAULT_LOG, + LTC4282_OC_FAULT_MASK); +} + +static int ltc4282_write_curr(struct ltc4282_state *st, u32 attr, + long val) +{ + /* need to pass it in millivolt */ + u32 in = DIV_ROUND_CLOSEST_ULL((u64)val * st->rsense, DECA * MICRO); + + switch (attr) { + case hwmon_curr_max: + return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40, + in); + case hwmon_curr_min: + return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MIN, 40, + in); + case hwmon_curr_reset_history: + return ltc4282_curr_reset_hist(st); + default: + return -EOPNOTSUPP; + } +} + +static int ltc4282_energy_enable_set(struct ltc4282_state *st, long val) +{ + int ret; + + guard(mutex)(&st->lock); + /* setting the bit halts the meter */ + ret = regmap_update_bits(st->map, LTC4282_ADC_CTRL, + LTC4282_METER_HALT_MASK, + FIELD_PREP(LTC4282_METER_HALT_MASK, !val)); + if (ret) + return ret; + + st->energy_en = !!val; + + return 0; +} + +static int ltc4282_write(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + + switch (type) { + case hwmon_power: + return ltc4282_write_power(st, attr, val); + case hwmon_in: + return ltc4282_write_in(st, attr, val, channel); + case hwmon_curr: + return ltc4282_write_curr(st, attr, val); + case hwmon_energy: + return ltc4282_energy_enable_set(st, val); + default: + return -EOPNOTSUPP; + } +} + +static umode_t ltc4282_in_is_visible(const struct ltc4282_state *st, u32 attr) +{ + switch (attr) { + case hwmon_in_input: + case hwmon_in_highest: + case hwmon_in_lowest: + case hwmon_in_max_alarm: + case hwmon_in_min_alarm: + case hwmon_in_label: + case hwmon_in_lcrit_alarm: + case hwmon_in_crit_alarm: + case hwmon_in_fault: + return 0444; + case hwmon_in_max: + case hwmon_in_min: + case hwmon_in_enable: + case hwmon_in_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_curr_is_visible(u32 attr) +{ + switch (attr) { + case hwmon_curr_input: + case hwmon_curr_highest: + case hwmon_curr_lowest: + case hwmon_curr_max_alarm: + case hwmon_curr_min_alarm: + case hwmon_curr_crit_alarm: + case hwmon_curr_label: + return 0444; + case hwmon_curr_max: + case hwmon_curr_min: + case hwmon_curr_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_power_is_visible(u32 attr) +{ + switch (attr) { + case hwmon_power_input: + case hwmon_power_input_highest: + case hwmon_power_input_lowest: + case hwmon_power_label: + case hwmon_power_max_alarm: + case hwmon_power_min_alarm: + return 0444; + case hwmon_power_max: + case hwmon_power_min: + case hwmon_power_reset_history: + return 0644; + default: + return 0; + } +} + +static umode_t ltc4282_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + return ltc4282_in_is_visible(data, attr); + case hwmon_curr: + return ltc4282_curr_is_visible(attr); + case hwmon_power: + return ltc4282_power_is_visible(attr); + case hwmon_energy: + /* hwmon_energy_enable */ + return 0644; + default: + return 0; + } +} + +static const char * const ltc4282_in_strs[] = { + "VSOURCE", "VDD", "VGPIO" +}; + +static int ltc4282_read_labels(struct device *dev, + enum hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + switch (type) { + case hwmon_in: + *str = ltc4282_in_strs[channel]; + return 0; + case hwmon_curr: + *str = "ISENSE"; + return 0; + case hwmon_power: + *str = "Power"; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static ssize_t ltc4282_energy_show(struct device *dev, + struct device_attribute *da, char *buf) +{ + struct ltc4282_state *st = dev_get_drvdata(dev); + u64 energy; + int ret; + + guard(mutex)(&st->lock); + if (!st->energy_en) + return -ENODATA; + + ret = ltc4282_read_energy(st, &energy); + if (ret < 0) + return ret; + + return sysfs_emit(buf, "%llu\n", energy); +} + +static const struct clk_ops ltc4282_ops = { + .recalc_rate = ltc4282_recalc_rate, + .round_rate = ltc4282_round_rate, + .set_rate = ltc4282_set_rate, + .disable = ltc4282_disable, +}; + +static int ltc428_clk_provider_setup(struct ltc4282_state *st, + struct device *dev) +{ + struct clk_init_data init; + int ret; + + if (!IS_ENABLED(CONFIG_COMMON_CLK)) + return 0; + + init.name = devm_kasprintf(dev, GFP_KERNEL, "%s-clk", + fwnode_get_name(dev_fwnode(dev))); + if (!init.name) + return -ENOMEM; + + init.ops = <c4282_ops; + init.flags = CLK_GET_RATE_NOCACHE; + st->clk_hw.init = &init; + + ret = devm_clk_hw_register(dev, &st->clk_hw); + if (ret) + return ret; + + return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, + &st->clk_hw); +} + +static int ltc428_clks_setup(struct ltc4282_state *st, struct device *dev) +{ + unsigned long rate; + struct clk *clkin; + u32 val; + int ret; + + ret = ltc428_clk_provider_setup(st, dev); + if (ret) + return ret; + + clkin = devm_clk_get_optional_enabled(dev, NULL); + if (IS_ERR(clkin)) + return dev_err_probe(dev, PTR_ERR(clkin), + "Failed to get clkin"); + if (!clkin) + return 0; + + rate = clk_get_rate(clkin); + if (!in_range(rate, LTC4282_CLKIN_MIN, LTC4282_CLKIN_RANGE)) + return dev_err_probe(dev, -EINVAL, + "Invalid clkin range(%lu) [%lu %lu]\n", + rate, LTC4282_CLKIN_MIN, + LTC4282_CLKIN_MAX); + + /* + * Clocks faster than 250KHZ should be reduced to 250KHZ. The clock + * frequency is divided by twice the value in the register. + */ + val = rate / (2 * LTC4282_CLKIN_MIN); + + return regmap_update_bits(st->map, LTC4282_CLK_DIV, + LTC4282_CLK_DIV_MASK, + FIELD_PREP(LTC4282_CLK_DIV_MASK, val)); +} + +static const int ltc4282_curr_lim_uv[] = { + 12500, 15625, 18750, 21875, 25000, 28125, 31250, 34375 +}; + +static int ltc4282_get_defaults(struct ltc4282_state *st, u32 *vin_mode) +{ + u32 reg_val, ilm_adjust; + int ret; + + ret = regmap_read(st->map, LTC4282_ADC_CTRL, ®_val); + if (ret) + return ret; + + st->energy_en = !FIELD_GET(LTC4282_METER_HALT_MASK, reg_val); + + ret = regmap_read(st->map, LTC4282_CTRL_MSB, ®_val); + if (ret) + return ret; + + *vin_mode = FIELD_GET(LTC4282_CTRL_VIN_MODE_MASK, reg_val); + + ret = regmap_read(st->map, LTC4282_ILIM_ADJUST, ®_val); + if (ret) + return ret; + + ilm_adjust = FIELD_GET(LTC4282_ILIM_ADJUST_MASK, reg_val); + st->vsense_max = ltc4282_curr_lim_uv[ilm_adjust]; + + st->in0_1_cache[LTC4282_CHAN_VSOURCE].en = FIELD_GET(LTC4282_VDD_MONITOR_MASK, + ilm_adjust); + if (!st->in0_1_cache[LTC4282_CHAN_VSOURCE].en) { + st->in0_1_cache[LTC4282_CHAN_VDD].en = true; + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_max_raw); + } + + return regmap_read(st->map, LTC4282_VSOURCE_MAX, + &st->in0_1_cache[LTC4282_CHAN_VDD].in_max_raw); +} + +/* + * Set max limits for ISENSE and Power as that depends on the max voltage on + * rsense that is defined in ILIM_ADJUST. This is specially important for power + * because for some rsense and vfsout values, if we allow the default raw 255 + * value, that would overflow long in 32bit archs when reading back the max + * power limit. + * + * Also set meaningful historic values for VDD and VSOURCE + * (0 would not mean much). + */ +static int ltc4282_set_max_limits(struct ltc4282_state *st) +{ + int ret; + + ret = ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40 * MILLI, + st->vsense_max); + if (ret) + return ret; + + /* Power is given by ISENSE * Vout. */ + st->power_max = DIV_ROUND_CLOSEST(st->vsense_max * DECA * MILLI, st->rsense) * st->vfs_out; + ret = ltc4282_write_power_byte(st, LTC4282_POWER_MAX, st->power_max); + if (ret) + return ret; + + if (st->in0_1_cache[LTC4282_CHAN_VDD].en) { + st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_lowest = st->vfs_out; + return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->vdd, 0, st->vfs_out); + } + + st->in0_1_cache[LTC4282_CHAN_VDD].in_lowest = st->vdd; + return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST, + st->vfs_out, 0, st->vfs_out); +} + +static const char * const ltc4282_gpio1_modes[] = { + "power_bad", "power_good" +}; + +static const char * const ltc4282_gpio2_modes[] = { + "adc_input", "stress_fet" +}; + +static int ltc4282_gpio_setup(struct ltc4282_state *st, struct device *dev) +{ + const char *func = NULL; + int ret; + + ret = device_property_read_string(dev, "adi,gpio1-mode", &func); + if (!ret) { + ret = match_string(ltc4282_gpio1_modes, + ARRAY_SIZE(ltc4282_gpio1_modes), func); + if (ret < 0) + return dev_err_probe(dev, ret, + "Invalid func(%s) for gpio1\n", + func); + + ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG, + LTC4282_GPIO_1_CONFIG_MASK, + FIELD_PREP(LTC4282_GPIO_1_CONFIG_MASK, ret)); + if (ret) + return ret; + } + + ret = device_property_read_string(dev, "adi,gpio2-mode", &func); + if (!ret) { + ret = match_string(ltc4282_gpio2_modes, + ARRAY_SIZE(ltc4282_gpio2_modes), func); + if (ret < 0) + return dev_err_probe(dev, ret, + "Invalid func(%s) for gpio2\n", + func); + if (!ret) { + /* setting the bit to 1 so the ADC to monitors GPIO2 */ + ret = regmap_set_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_GPIO_MODE_MASK); + } else { + ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG, + LTC4282_GPIO_2_FET_STRESS_MASK, + FIELD_PREP(LTC4282_GPIO_2_FET_STRESS_MASK, 1)); + } + + if (ret) + return ret; + } + + if (!device_property_read_bool(dev, "adi,gpio3-monitor-enable")) + return 0; + + if (func && !strcmp(func, "adc_input")) + return dev_err_probe(dev, -EINVAL, + "Cannot have both gpio2 and gpio3 muxed into the ADC"); + + return regmap_clear_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_GPIO_MODE_MASK); +} + +static const char * const ltc4282_dividers[] = { + "external", "vdd_5_percent", "vdd_10_percent", "vdd_15_percent" +}; + +/* This maps the Vout full scale for the given Vin mode */ +static const u16 ltc4282_vfs_milli[] = { 5540, 8320, 16640, 33280 }; + +static const u16 ltc4282_vdd_milli[] = { 3300, 5000, 12000, 24000 }; + +enum { + LTC4282_VIN_3_3V, + LTC4282_VIN_5V, + LTC4282_VIN_12V, + LTC4282_VIN_24V, +}; + +static int ltc4282_setup(struct ltc4282_state *st, struct device *dev) +{ + const char *divider; + u32 val, vin_mode; + int ret; + + /* The part has an eeprom so let's get the needed defaults from it */ + ret = ltc4282_get_defaults(st, &vin_mode); + if (ret) + return ret; + + ret = device_property_read_u32(dev, "adi,rsense-nano-ohms", + &st->rsense); + if (ret) + return dev_err_probe(dev, ret, + "Failed to read adi,rsense-nano-ohms\n"); + if (st->rsense < CENTI) + return dev_err_probe(dev, -EINVAL, + "adi,rsense-nano-ohms too small (< %lu)\n", + CENTI); + + /* + * The resolution for rsense is tenths of micro (eg: 62.5 uOhm) which + * means we need nano in the bindings. However, to make things easier to + * handle (with respect to overflows) we divide it by 100 as we don't + * really need the last two digits. + */ + st->rsense /= CENTI; + + val = vin_mode; + ret = device_property_read_u32(dev, "adi,vin-mode-microvolt", &val); + if (!ret) { + switch (val) { + case 3300000: + val = LTC4282_VIN_3_3V; + break; + case 5000000: + val = LTC4282_VIN_5V; + break; + case 12000000: + val = LTC4282_VIN_12V; + break; + case 24000000: + val = LTC4282_VIN_24V; + break; + default: + return dev_err_probe(dev, -EINVAL, + "Invalid val(%u) for vin-mode-microvolt\n", + val); + } + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_VIN_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_VIN_MODE_MASK, val)); + if (ret) + return ret; + + /* Foldback mode should also be set to the input voltage */ + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_FOLDBACK_MODE_MASK, + FIELD_PREP(LTC4282_FOLDBACK_MODE_MASK, val)); + if (ret) + return ret; + } + + st->vfs_out = ltc4282_vfs_milli[val]; + st->vdd = ltc4282_vdd_milli[val]; + + ret = device_property_read_u32(dev, "adi,current-limit-sense-microvolt", + &st->vsense_max); + if (!ret) { + int reg_val; + + switch (val) { + case 12500: + reg_val = 0; + break; + case 15625: + reg_val = 1; + break; + case 18750: + reg_val = 2; + break; + case 21875: + reg_val = 3; + break; + case 25000: + reg_val = 4; + break; + case 28125: + reg_val = 5; + break; + case 31250: + reg_val = 6; + break; + case 34375: + reg_val = 7; + break; + default: + return dev_err_probe(dev, -EINVAL, + "Invalid val(%u) for adi,current-limit-microvolt\n", + st->vsense_max); + } + + ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST, + LTC4282_ILIM_ADJUST_MASK, + FIELD_PREP(LTC4282_ILIM_ADJUST_MASK, reg_val)); + if (ret) + return ret; + } + + ret = ltc4282_set_max_limits(st); + if (ret) + return ret; + + ret = device_property_read_string(dev, "adi,overvoltage-dividers", + ÷r); + if (!ret) { + int div = match_string(ltc4282_dividers, + ARRAY_SIZE(ltc4282_dividers), divider); + if (div < 0) + return dev_err_probe(dev, -EINVAL, + "Invalid val(%s) for adi,overvoltage-divider\n", + divider); + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_OV_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_OV_MODE_MASK, div)); + } + + ret = device_property_read_string(dev, "adi,undervoltage-dividers", + ÷r); + if (!ret) { + int div = match_string(ltc4282_dividers, + ARRAY_SIZE(ltc4282_dividers), divider); + if (div < 0) + return dev_err_probe(dev, -EINVAL, + "Invalid val(%s) for adi,undervoltage-divider\n", + divider); + + ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB, + LTC4282_CTRL_UV_MODE_MASK, + FIELD_PREP(LTC4282_CTRL_UV_MODE_MASK, div)); + } + + if (device_property_read_bool(dev, "adi,overcurrent-retry")) { + ret = regmap_set_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_OC_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,overvoltage-retry-disable")) { + ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_OV_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,undervoltage-retry-disable")) { + ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB, + LTC4282_CTRL_UV_RETRY_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,fault-log-enable")) { + ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, + LTC4282_FAULT_LOG_EN_MASK); + if (ret) + return ret; + } + + if (device_property_read_bool(dev, "adi,fault-log-enable")) { + ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_FAULT_LOG_EN_MASK); + if (ret) + return ret; + } + + ret = device_property_read_u32(dev, "adi,fet-bad-timeout-ms", &val); + if (!ret) { + if (val > LTC4282_FET_BAD_MAX_TIMEOUT) + return dev_err_probe(dev, -EINVAL, + "Invalid value(%u) for adi,fet-bad-timeout-ms", + val); + + ret = regmap_write(st->map, LTC4282_FET_BAD_FAULT_TIMEOUT, val); + if (ret) + return ret; + } + + return ltc4282_gpio_setup(st, dev); +} + +static bool ltc4282_readable_reg(struct device *dev, unsigned int reg) +{ + if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2) + return false; + + return true; +} + +static bool ltc4282_writable_reg(struct device *dev, unsigned int reg) +{ + if (reg == LTC4282_STATUS_LSB || reg == LTC4282_STATUS_MSB) + return false; + if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2) + return false; + + return true; +} + +static const struct regmap_config ltc4282_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = LTC4282_RESERVED_3, + .readable_reg = ltc4282_readable_reg, + .writeable_reg = ltc4282_writable_reg, +}; + +static const struct hwmon_channel_info * const ltc4282_info[] = { + HWMON_CHANNEL_INFO(in, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_MAX_ALARM | HWMON_I_ENABLE | + HWMON_I_RESET_HISTORY | HWMON_I_FAULT | + HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_MAX_ALARM | HWMON_I_LCRIT_ALARM | + HWMON_I_CRIT_ALARM | HWMON_I_ENABLE | + HWMON_I_RESET_HISTORY | HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST | + HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM | + HWMON_I_RESET_HISTORY | HWMON_I_MAX_ALARM | + HWMON_I_LABEL), + HWMON_CHANNEL_INFO(curr, + HWMON_C_INPUT | HWMON_C_LOWEST | HWMON_C_HIGHEST | + HWMON_C_MAX | HWMON_C_MIN | HWMON_C_MIN_ALARM | + HWMON_C_MAX_ALARM | HWMON_C_CRIT_ALARM | + HWMON_C_RESET_HISTORY | HWMON_C_LABEL), + HWMON_CHANNEL_INFO(power, + HWMON_P_INPUT | HWMON_P_INPUT_LOWEST | + HWMON_P_INPUT_HIGHEST | HWMON_P_MAX | HWMON_P_MIN | + HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM | + HWMON_P_RESET_HISTORY | HWMON_P_LABEL), + HWMON_CHANNEL_INFO(energy, + HWMON_E_ENABLE), + NULL +}; + +static const struct hwmon_ops ltc4282_hwmon_ops = { + .read = ltc4282_read, + .write = ltc4282_write, + .is_visible = ltc4282_is_visible, + .read_string = ltc4282_read_labels, +}; + +static const struct hwmon_chip_info ltc2947_chip_info = { + .ops = <c4282_hwmon_ops, + .info = ltc4282_info, +}; + +/* energy attributes are 6bytes wide so we need u64 */ +static SENSOR_DEVICE_ATTR_RO(energy1_input, ltc4282_energy, 0); + +static struct attribute *ltc4282_attrs[] = { + &sensor_dev_attr_energy1_input.dev_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(ltc4282); + +static int ltc4282_show_fault_log(void *arg, u64 *val, u32 mask) +{ + struct ltc4282_state *st = arg; + long alarm; + int ret; + + ret = ltc4282_read_alarm(st, LTC4282_FAULT_LOG, mask, &alarm); + if (ret) + return ret; + + *val = alarm; + + return 0; +} + +static int ltc4282_show_curr1_crit_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_OC_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_curr1_crit_fault_log, + ltc4282_show_curr1_crit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_in1_lcrit_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_UV_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_lcrit_fault_log, + ltc4282_show_in1_lcrit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_in1_crit_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_OV_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_crit_fault_log, + ltc4282_show_in1_crit_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_fet_bad_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_FET_BAD_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_bad_fault_log, + ltc4282_show_fet_bad_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_fet_short_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_FET_SHORT_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_short_fault_log, + ltc4282_show_fet_short_fault_log, NULL, "%llu\n"); + +static int ltc4282_show_power1_bad_fault_log(void *arg, u64 *val) +{ + return ltc4282_show_fault_log(arg, val, LTC4282_POWER_BAD_FAULT_MASK); +} +DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_power1_bad_fault_log, + ltc4282_show_power1_bad_fault_log, NULL, "%llu\n"); + +static void ltc4282_debugfs_remove(void *dir) +{ + debugfs_remove_recursive(dir); +} + +static void ltc4282_debugfs_init(struct ltc4282_state *st, + struct i2c_client *i2c, + const struct device *hwmon) +{ + const char *debugfs_name; + struct dentry *dentry; + int ret; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return; + + debugfs_name = devm_kasprintf(&i2c->dev, GFP_KERNEL, "ltc4282-%s", + dev_name(hwmon)); + if (!debugfs_name) + return; + + dentry = debugfs_create_dir(debugfs_name, NULL); + if (IS_ERR(dentry)) + return; + + ret = devm_add_action_or_reset(&i2c->dev, ltc4282_debugfs_remove, + dentry); + if (ret) + return; + + debugfs_create_file_unsafe("power1_bad_fault_log", 0400, dentry, st, + <c4282_power1_bad_fault_log); + debugfs_create_file_unsafe("in0_fet_short_fault_log", 0400, dentry, st, + <c4282_fet_short_fault_log); + debugfs_create_file_unsafe("in0_fet_bad_fault_log", 0400, dentry, st, + <c4282_fet_bad_fault_log); + debugfs_create_file_unsafe("in1_crit_fault_log", 0400, dentry, st, + <c4282_in1_crit_fault_log); + debugfs_create_file_unsafe("in1_lcrit_fault_log", 0400, dentry, st, + <c4282_in1_lcrit_fault_log); + debugfs_create_file_unsafe("curr1_crit_fault_log", 0400, dentry, st, + <c4282_curr1_crit_fault_log); +} + +static int ltc4282_probe(struct i2c_client *i2c) +{ + struct device *dev = &i2c->dev, *hwmon; + struct ltc4282_state *st; + int ret; + + st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); + if (!st) + return dev_err_probe(dev, -ENOMEM, + "Failed to allocate memory\n"); + + st->map = devm_regmap_init_i2c(i2c, <c4282_regmap_config); + if (IS_ERR(st->map)) + return dev_err_probe(dev, PTR_ERR(st->map), + "failed regmap init\n"); + + /* Soft reset */ + ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_RESET_MASK); + if (ret) + return ret; + + /* Yes, it's big but it is as specified in the datasheet */ + msleep(3200); + + ret = ltc428_clks_setup(st, dev); + if (ret) + return ret; + + ret = ltc4282_setup(st, dev); + if (ret) + return ret; + + mutex_init(&st->lock); + hwmon = devm_hwmon_device_register_with_info(dev, "ltc4282", st, + <c2947_chip_info, + ltc4282_groups); + if (IS_ERR(hwmon)) + return PTR_ERR(hwmon); + + ltc4282_debugfs_init(st, i2c, hwmon); + + return 0; +} + +static const struct of_device_id ltc4282_of_match[] = { + { .compatible = "adi,ltc4282" }, + {} +}; +MODULE_DEVICE_TABLE(of, ltc4282_of_match); + +static struct i2c_driver ltc4282_driver = { + .driver = { + .name = "ltc4282", + .of_match_table = ltc4282_of_match, + }, + .probe = ltc4282_probe, +}; +module_i2c_driver(ltc4282_driver); + +MODULE_AUTHOR("Nuno Sa "); +MODULE_DESCRIPTION("LTC4282 I2C High Current Hot Swap Controller"); +MODULE_LICENSE("GPL"); From 7b7cd4c30033a19ea5dfaae7bfe92fe93d749ee1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jan 2024 10:21:08 +0100 Subject: [PATCH 388/707] mm: add a mapping_clear_large_folios helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "disable large folios for shmem file used by xfs xfile". Darrick reported that the fairly new XFS xfile code blows up when force enabling large folio for shmem. This series fixes this quickly by disabling large folios for this particular shmem file for now until it can be fixed properly, which will be a lot more invasive. This patch (of 2): Users of shmem_kernel_file_setup might not be able to deal with large folios (yet). Give them a way to disable large folio support on their mapping. Link: https://lkml.kernel.org/r/20240110092109.1950011-1-hch@lst.de Link: https://lkml.kernel.org/r/20240110092109.1950011-2-hch@lst.de Fixes: 3934e8ebb7cc ("xfs: create a big array data structure") Signed-off-by: Christoph Hellwig Cc: Chandan Babu R Cc: Christian König Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Airlie Cc: Dave Hansen Cc: David Howells Cc: Huang Rui Cc: Hugh Dickins Cc: Jani Nikula Cc: Jarkko Sakkinen Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: Matthew Wilcox (Oracle) Cc: Maxime Ripard Cc: Rodrigo Vivi Cc: Thomas Zimmermann Cc: Tvrtko Ursulin Cc: Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2df35e65557d27..431b12a23299d8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -360,6 +360,20 @@ static inline void mapping_set_large_folios(struct address_space *mapping) __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/** + * mapping_clear_large_folios() - Disable large folio support for a mapping + * @mapping: The mapping. + * + * This can be called to undo the effect of mapping_set_large_folios(). + * + * Context: This should not be called while the inode is active as it + * is non-atomic. + */ +static inline void mapping_clear_large_folios(struct address_space *mapping) +{ + __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. From 21eccbf53359ab64b3808d819f00f2f34dcc5dfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Jan 2024 10:21:09 +0100 Subject: [PATCH 389/707] xfs: disable large folio support in xfile_create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xfarray code will crash if large folios are force enabled using: echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled Fixing this will require a bit of an API change, and prefeably sorting out the hwpoison story for pages vs folio and where it is placed in the shmem API. For now use this one liner to disable large folios. Link: https://lkml.kernel.org/r/20240110092109.1950011-3-hch@lst.de Fixes: 3934e8ebb7cc ("xfs: create a big array data structure") Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Chandan Babu R Cc: Christian König Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: David Howells Cc: Huang Rui Cc: Hugh Dickins Cc: Jani Nikula Cc: Jarkko Sakkinen Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: Matthew Wilcox (Oracle) Cc: Maxime Ripard Cc: Rodrigo Vivi Cc: Thomas Zimmermann Cc: Tvrtko Ursulin Cc: Signed-off-by: Andrew Morton --- fs/xfs/scrub/xfile.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 090c3ead43fdf1..1a8d1bedd0b0dc 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -94,6 +94,11 @@ xfile_create( lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + /* + * We're not quite ready for large folios yet. + */ + mapping_clear_large_folios(inode->i_mapping); + trace_xfile_create(xf); *xfilep = xf; From 252ab4b7f958ad64d75661280b40529c1f18bc43 Mon Sep 17 00:00:00 2001 From: Prakash Sangappa Date: Tue, 23 Jan 2024 12:04:42 -0800 Subject: [PATCH 390/707] mm: hugetlb pages should not be reserved by shmat() if SHM_NORESERVE For shared memory of type SHM_HUGETLB, hugetlb pages are reserved in shmget() call. If SHM_NORESERVE flags is specified then the hugetlb pages are not reserved. However when the shared memory is attached with the shmat() call the hugetlb pages are getting reserved incorrectly for SHM_HUGETLB shared memory created with SHM_NORESERVE which is a bug. ------------------------------- Following test shows the issue. $cat shmhtb.c int main() { int shmflags = 0660 | IPC_CREAT | SHM_HUGETLB | SHM_NORESERVE; int shmid; shmid = shmget(SKEY, SHMSZ, shmflags); if (shmid < 0) { printf("shmat: shmget() failed, %d\n", errno); return 1; } printf("After shmget()\n"); system("cat /proc/meminfo | grep -i hugepages_"); shmat(shmid, NULL, 0); printf("\nAfter shmat()\n"); system("cat /proc/meminfo | grep -i hugepages_"); shmctl(shmid, IPC_RMID, NULL); return 0; } #sysctl -w vm.nr_hugepages=20 #./shmhtb After shmget() HugePages_Total: 20 HugePages_Free: 20 HugePages_Rsvd: 0 HugePages_Surp: 0 After shmat() HugePages_Total: 20 HugePages_Free: 20 HugePages_Rsvd: 5 <-- HugePages_Surp: 0 -------------------------------- Fix is to ensure that hugetlb pages are not reserved for SHM_HUGETLB shared memory in the shmat() call. Link: https://lkml.kernel.org/r/1706040282-12388-1-git-send-email-prakash.sangappa@oracle.com Signed-off-by: Prakash Sangappa Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 671664fed3077f..ee13c2ca8ad293 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); + vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; + + vm_flags = vma->vm_flags; + /* + * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip + * reserving here. Note: only for SHM hugetlbfs file, the inode + * flag S_PRIVATE is set. + */ + if (inode->i_flags & S_PRIVATE) + vm_flags |= VM_NORESERVE; + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vma->vm_flags)) + vm_flags)) goto out; ret = 0; From b0f130279ae18085fa6d156c4316fb2a4c1a6d87 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 16:50:50 +0100 Subject: [PATCH 391/707] getrusage: move thread_group_cputime_adjusted() outside of lock_task_sighand() Patch series "getrusage: use sig->stats_lock", v2. This patch (of 2): thread_group_cputime() does its own locking, we can safely shift thread_group_cputime_adjusted() which does another for_each_thread loop outside of ->siglock protected section. This is also preparation for the next patch which changes getrusage() to use stats_lock instead of siglock, thread_group_cputime() takes the same lock. With the current implementation recursive read_seqbegin_or_lock() is fine, thread_group_cputime() can't enter the slow mode if the caller holds stats_lock, yet this looks more safe and better performance-wise. Link: https://lkml.kernel.org/r/20240122155023.GA26169@redhat.com Link: https://lkml.kernel.org/r/20240122155050.GA26205@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dylan Hatch Tested-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index e219fcfa112d86..70ad06ad852e59 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1785,17 +1785,19 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; - unsigned long maxrss = 0; + unsigned long maxrss; + struct mm_struct *mm; struct signal_struct *sig = p->signal; - memset((char *)r, 0, sizeof (*r)); + memset(r, 0, sizeof(*r)); utime = stime = 0; + maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; - goto out; + goto out_thread; } if (!lock_task_sighand(p, &flags)) @@ -1819,9 +1821,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) fallthrough; case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; @@ -1839,19 +1838,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) } unlock_task_sighand(p, &flags); -out: - r->ru_utime = ns_to_kernel_old_timeval(utime); - r->ru_stime = ns_to_kernel_old_timeval(stime); + if (who == RUSAGE_CHILDREN) + goto out_children; - if (who != RUSAGE_CHILDREN) { - struct mm_struct *mm = get_task_mm(p); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; - if (mm) { - setmax_mm_hiwater_rss(&maxrss, mm); - mmput(mm); - } +out_thread: + mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); } + +out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) From f51687c6deb99c0ba0b8457b8be06312425bdd91 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 16:50:53 +0100 Subject: [PATCH 392/707] getrusage: use sig->stats_lock rather than lock_task_sighand() lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call getrusage() at the same time and the process has NR_THREADS, spin_lock_irq will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. Change getrusage() to use sig->stats_lock, it was specifically designed for this type of use. This way it runs lockless in the likely case. TODO: - Change do_task_stat() to use sig->stats_lock too, then we can remove spin_lock_irq(siglock) in wait_task_zombie(). - Turn sig->stats_lock into seqcount_rwlock_t, this way the readers in the slow mode won't exclude each other. See https://lore.kernel.org/all/20230913154907.GA26210@redhat.com/ - stats_lock has to disable irqs because ->siglock can be taken in irq context, it would be very nice to change __exit_signal() to avoid the siglock->stats_lock dependency. Link: https://lkml.kernel.org/r/20240122155053.GA26214@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dylan Hatch Tested-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 70ad06ad852e59..f8e543f1e38a06 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,9 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; + unsigned int seq = 0; +retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; @@ -1800,8 +1802,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) goto out_thread; } - if (!lock_task_sighand(p, &flags)) - return; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: @@ -1829,14 +1830,23 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; + + rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); + rcu_read_unlock(); + break; default: BUG(); } - unlock_task_sighand(p, &flags); + + if (need_seqretry(&sig->stats_lock, seq)) { + seq = 1; + goto retry; + } + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; From 41d1a01ebc166ba59fe5ec3e4eb32271f0267b31 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:33:55 +0100 Subject: [PATCH 393/707] fs/proc: do_task_stat: move thread_group_cputime_adjusted() outside of lock_task_sighand() Patch series "fs/proc: do_task_stat: use sig->stats_". do_task_stat() has the same problem as getrusage() had before "getrusage: use sig->stats_lock rather than lock_task_sighand()": a hard lockup. If NR_CPUS threads call lock_task_sighand() at the same time and the process has NR_THREADS, spin_lock_irq will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. This patch (of 3): thread_group_cputime() does its own locking, we can safely shift thread_group_cputime_adjusted() which does another for_each_thread loop outside of ->siglock protected section. Not only this removes for_each_thread() from the critical section with irqs disabled, this removes another case when stats_lock is taken with siglock held. We want to remove this dependency, then we can change the users of stats_lock to not disable irqs. Link: https://lkml.kernel.org/r/20240123153313.GA21832@redhat.com Link: https://lkml.kernel.org/r/20240123153355.GA21854@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- fs/proc/array.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ff08a8957552ad..45ba9186380843 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; + cutime = cstime = 0; cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { @@ -546,7 +546,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) @@ -562,10 +561,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { + task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } From 48596ced7a2f19e820304fda65e1d819544fc256 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:33:57 +0100 Subject: [PATCH 394/707] fs/proc: do_task_stat: use sig->stats_lock to gather the threads/children stats lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call do_task_stat() at the same time and the process has NR_THREADS, it will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. Change do_task_stat() to use sig->stats_lock to gather the statistics outside of ->siglock protected section, in the likely case this code will run lockless. Link: https://lkml.kernel.org/r/20240123153357.GA21857@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- fs/proc/array.c | 58 +++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 45ba9186380843..34a47fb0c57f25 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; + unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t; - - __for_each_thread(sig, t) { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } @@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + rcu_read_lock(); + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + rcu_read_unlock(); + } + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + if (whole) { thread_group_cputime_adjusted(task, &utime, &stime); } else { From b608b1d1b11df82d2bb1ab7a071e3b213a5b0f9c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:34:00 +0100 Subject: [PATCH 395/707] exit: wait_task_zombie: kill the no longer necessary spin_lock_irq(siglock) After the recent changes nobody use siglock to read the values protected by stats_lock, we can kill spin_lock_irq(¤t->sighand->siglock) and update the comment. With this patch only __exit_signal() and thread_group_start_cputime() take stats_lock under siglock. Link: https://lkml.kernel.org/r/20240123153359.GA21866@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/exit.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3988a02efaef06..dfb963d2f862ad 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads - * which can reap other children at the same time. Until - * we change k_getrusage()-like users to rely on this lock - * we have to take ->siglock as well. + * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(¤t->sighand->siglock); - write_seqlock(&psig->stats_lock); + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - write_sequnlock(&psig->stats_lock); - spin_unlock_irq(¤t->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) From d5e91629e5c426a872a356bdbbdd29a50d91afef Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 23 Jan 2024 14:17:55 +0000 Subject: [PATCH 396/707] mm/userfaultfd: UFFDIO_MOVE implementation should use ptep_get() Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch) call sites to use ptep_get() instead of doing a direct dereference of the pte. Full rationale can be found in that commit's log. Since then, UFFDIO_MOVE has been implemented which does 7 direct pte dereferences. Let's fix those up to use ptep_get(). I've asserted in the past that there is no reliable automated mechanism to catch these; I'm relying on a combination of Coccinelle (which throws up a lot of false positives) and some compiler magic to force a compiler error on dereference. But given the frequency with which new issues are coming up, I'll add it to my todo list to try to find an automated solution. Link: https://lkml.kernel.org/r/20240123141755.3836179-1-ryan.roberts@arm.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Ryan Roberts Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 75fcf1f783bc56..7cf7d43842590c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -902,8 +902,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { err = -EAGAIN; goto out; } @@ -946,8 +946,8 @@ static int move_swap_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1016,7 +1016,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, } spin_lock(dst_ptl); - orig_dst_pte = *dst_pte; + orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { err = -EEXIST; @@ -1024,7 +1024,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, } spin_lock(src_ptl); - orig_src_pte = *src_pte; + orig_src_pte = ptep_get(src_pte); spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) @@ -1054,7 +1054,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, * page isn't freed under us */ spin_lock(src_ptl); - if (!pte_same(orig_src_pte, *src_pte)) { + if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); err = -EAGAIN; goto out; From 95bfb99de89e0636c52a00472352f7f8e93cfa41 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 24 Jan 2024 21:19:36 +0900 Subject: [PATCH 397/707] nilfs2: fix data corruption in dsync block recovery for small block sizes The helper function nilfs_recovery_copy_block() of nilfs_recovery_dsync_blocks(), which recovers data from logs created by data sync writes during a mount after an unclean shutdown, incorrectly calculates the on-page offset when copying repair data to the file's page cache. In environments where the block size is smaller than the page size, this flaw can cause data corruption and leak uninitialized memory bytes during the recovery process. Fix these issues by correcting this byte offset calculation on the page. Link: https://lkml.kernel.org/r/20240124121936.10575-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0955b657938ff2..a9b8d77c8c1d55 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - struct page *page) + loff_t pos, struct page *page) { struct buffer_head *bh_org; + size_t from = pos & ~PAGE_MASK; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); @@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, return -EIO; kaddr = kmap_atomic(page); - memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; @@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; From a06040dcc590b0e90be9c524ad05089cdd7858a4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 10:00:22 +0000 Subject: [PATCH 398/707] mm: memcg: optimize parent iteration in memcg_rstat_updated() In memcg_rstat_updated(), we iterate the memcg being updated and its parents to update memcg->vmstats_percpu->stats_updates in the fast path (i.e. no atomic updates). According to my math, this is 3 memory loads (and potentially 3 cache misses) per memcg: - Load the address of memcg->vmstats_percpu. - Load vmstats_percpu->stats_updates (based on some percpu calculation). - Load the address of the parent memcg. Avoid most of the cache misses by caching a pointer from each struct memcg_vmstats_percpu to its parent on the corresponding CPU. In this case, for the first memcg we have 2 memory loads (same as above): - Load the address of memcg->vmstats_percpu. - Load vmstats_percpu->stats_updates (based on some percpu calculation). Then for each additional memcg, we need a single load to get the parent's stats_updates directly. This reduces the number of loads from O(3N) to O(2+N) -- where N is the number of memcgs we need to iterate. Additionally, stash a pointer to memcg->vmstats in each struct memcg_vmstats_percpu such that we can access the atomic counter that all CPUs fold into, memcg->vmstats->stats_updates. memcg_should_flush_stats() is changed to memcg_vmstats_needs_flush() to accept a struct memcg_vmstats pointer accordingly. In struct memcg_vmstats_percpu, make sure both pointers together with stats_updates live on the same cacheline. Finally, update mem_cgroup_alloc() to take in a parent pointer and initialize the new cache pointers on each CPU. The percpu loop in mem_cgroup_alloc() may look concerning, but there are multiple similar loops in the cgroup creation path (e.g. cgroup_rstat_init()), most of which are hidden within alloc_percpu(). According to Oliver's testing [1], this fixes multiple 30-38% regressions in vm-scalability, will-it-scale-tlb_flush2, and will-it-scale-fallocate1. This comes at a cost of 2 more pointers per CPU (<2KB on a machine with 128 CPUs). [1] https://lore.kernel.org/lkml/ZbDJsfsZt2ITyo61@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20240124100023.660032-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Fixes: 8d59d2214c23 ("mm: memcg: make stats flushing threshold per-memcg") Tested-by: kernel test robot Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202401221624.cb53a8ca-oliver.sang@intel.com Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46d8d02114cfee..d9ca0fdbe4ab04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -633,8 +633,15 @@ struct memcg_vmstats_percpu { unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; + /* Fit members below in a single cacheline for memcg_rstat_updated() */ + CACHELINE_PADDING(_pad1_); + /* Stats updates since the last flush */ unsigned int stats_updates; + + /* Cached pointers for fast iteration in memcg_rstat_updated() */ + struct memcg_vmstats_percpu *parent; + struct memcg_vmstats *vmstats; }; struct memcg_vmstats { @@ -698,36 +705,35 @@ static void memcg_stats_unlock(void) } -static bool memcg_should_flush_stats(struct mem_cgroup *memcg) +static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&memcg->vmstats->stats_updates) > + return atomic64_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + struct memcg_vmstats_percpu *statc; int cpu = smp_processor_id(); - unsigned int x; if (!val) return; cgroup_rstat_updated(memcg->css.cgroup, cpu); - - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, - abs(val)); - - if (x < MEMCG_CHARGE_BATCH) + statc = this_cpu_ptr(memcg->vmstats_percpu); + for (; statc; statc = statc->parent) { + statc->stats_updates += abs(val); + if (statc->stats_updates < MEMCG_CHARGE_BATCH) continue; /* * If @memcg is already flush-able, increasing stats_updates is * redundant. Avoid the overhead of the atomic update. */ - if (!memcg_should_flush_stats(memcg)) - atomic64_add(x, &memcg->vmstats->stats_updates); - __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); + if (!memcg_vmstats_needs_flush(statc->vmstats)) + atomic64_add(statc->stats_updates, + &statc->vmstats->stats_updates); + statc->stats_updates = 0; } } @@ -756,7 +762,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg) if (!memcg) memcg = root_mem_cgroup; - if (memcg_should_flush_stats(memcg)) + if (memcg_vmstats_needs_flush(memcg->vmstats)) do_flush_stats(memcg); } @@ -770,7 +776,7 @@ void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) static void flush_memcg_stats_dwork(struct work_struct *w) { /* - * Deliberately ignore memcg_should_flush_stats() here so that flushing + * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing * in latency-sensitive paths is as cheap as possible. */ do_flush_stats(root_mem_cgroup); @@ -5477,10 +5483,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) __mem_cgroup_free(memcg); } -static struct mem_cgroup *mem_cgroup_alloc(void) +static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { + struct memcg_vmstats_percpu *statc, *pstatc; struct mem_cgroup *memcg; - int node; + int node, cpu; int __maybe_unused i; long error = -ENOMEM; @@ -5504,6 +5511,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg->vmstats_percpu) goto fail; + for_each_possible_cpu(cpu) { + if (parent) + pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + statc->parent = parent ? pstatc : NULL; + statc->vmstats = memcg->vmstats; + } + for_each_node(node) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; @@ -5549,7 +5564,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; old_memcg = set_active_memcg(parent); - memcg = mem_cgroup_alloc(); + memcg = mem_cgroup_alloc(parent); set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); From 38312a7658804dd56d43f78d5081d9e742d9115d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Jan 2024 16:40:14 +0800 Subject: [PATCH 399/707] mm/memory-failure: fix crash in split_huge_page_to_list from soft_offline_page When I did soft offline stress test, a machine was observed to crash with the following message: kernel BUG at include/linux/memcontrol.h:554! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 5 PID: 3837 Comm: hwpoison.sh Not tainted 6.7.0-next-20240112-00001-g8ecf3e7fb7c8-dirty #97 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 RIP: 0010:folio_memcg+0xaf/0xd0 Code: 10 5b 5d c3 cc cc cc cc 48 c7 c6 08 b1 f2 b2 48 89 ef e8 b4 c5 f8 ff 90 0f 0b 48 c7 c6 d0 b0 f2 b2 48 89 ef e8 a2 c5 f8 ff 90 <0f> 0b 48 c7 c6 08 b1 f2 b2 48 89 ef e8 90 c5 f8 ff 90 0f 0b 66 66 RSP: 0018:ffffb6c043657c98 EFLAGS: 00000296 RAX: 000000000000004b RBX: ffff932bc1d1e401 RCX: ffff933abfb5c908 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff933abfb5c900 RBP: ffffea6f04019080 R08: ffffffffb3338ce8 R09: 0000000000009ffb R10: 00000000000004dd R11: ffffffffb3308d00 R12: ffffea6f04019080 R13: ffffea6f04019080 R14: 0000000000000001 R15: ffffb6c043657da0 FS: 00007f6c60f6b740(0000) GS:ffff933abfb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559c3bc8b980 CR3: 0000000107f1c000 CR4: 00000000000006f0 Call Trace: split_huge_page_to_list+0x4d/0x1380 try_to_split_thp_page+0x3a/0xf0 soft_offline_page+0x1ea/0x8a0 soft_offline_page_store+0x52/0x90 kernfs_fop_write_iter+0x118/0x1b0 vfs_write+0x30b/0x430 ksys_write+0x5e/0xe0 do_syscall_64+0xb0/0x1b0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f6c60d14697 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffe9b72b8d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f6c60d14697 RDX: 000000000000000c RSI: 0000559c3bc8b980 RDI: 0000000000000001 RBP: 0000559c3bc8b980 R08: 00007f6c60dd1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007f6c60e1a780 R14: 00007f6c60e16600 R15: 00007f6c60e15a00 The problem is that page->mapping is overloaded with slab->slab_list or slabs fields now, so slab pages could be taken as non-LRU movable pages if field slabs contains PAGE_MAPPING_MOVABLE or slab_list->prev is set to LIST_POISON2. These slab pages will be treated as thp later leading to crash in split_huge_page_to_list(). Link: https://lkml.kernel.org/r/20240126065837.2100184-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20240124084014.1772906-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Fixes: 130d4df57390 ("mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head") Reviewed-by: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 636280d04008d8..9349948f1abfd1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1377,6 +1377,9 @@ void ClearPageHWPoisonTakenOff(struct page *page) */ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { + if (PageSlab(page)) + return false; + /* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true; From cf5e3522d28ca30102d7d61eb504b4eaa338da33 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 24 Oct 2023 20:51:25 +0500 Subject: [PATCH 400/707] selftests: core: include linux/close_range.h for CLOSE_RANGE_* macros Correct header file is needed for getting CLOSE_RANGE_* macros. Previously it was tested with newer glibc which didn't show the need to include the header which was a mistake. Link: https://lkml.kernel.org/r/20231024155137.219700-1-usama.anjum@collabora.com Fixes: ec54424923cf ("selftests: core: remove duplicate defines") Reported-by: Aishwarya TCV Link: https://lore.kernel.org/all/7161219e-0223-d699-d6f3-81abd9abf13b@arm.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/core/close_range_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 534576f06df1cc..c59e4adb905df6 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" From 6c50951b21b502346557e08ca125ec1b372b92b1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 22 Jan 2024 22:43:05 -0800 Subject: [PATCH 401/707] arch/arm/mm: fix major fault accounting when retrying under per-VMA lock The change [1] missed ARM architecture when fixing major fault accounting for page fault retry under per-VMA lock. The user-visible effects is that it restores correct major fault accounting that was broken after [2] was merged in 6.7 kernel. The more detailed description is in [3] and this patch simply adds the same fix to ARM architecture which I missed in [3]. Add missing code to fix ARM architecture fault accounting. [1] 46e714c729c8 ("arch/mm/fault: fix major fault accounting when retrying under per-VMA lock") [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ [3] https://lore.kernel.org/all/20231226214610.109282-1-surenb@google.com/ Link: https://lkml.kernel.org/r/20240123064305.2829244-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Reported-by: Russell King (Oracle) Signed-off-by: Suren Baghdasaryan Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/arm/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index e96fb40b9cc32a..07565b593ed681 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -298,6 +298,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { From 489bd091f9465e4b4b77c812255642d596d43e72 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 26 Jan 2024 12:25:48 +0900 Subject: [PATCH 402/707] mm/madvise: don't forget to leave lazy MMU mode in madvise_cold_or_pageout_pte_range() We need to leave lazy MMU mode before unlocking. Link: https://lkml.kernel.org/r/20240126032608.355899-1-senozhatsky@chromium.org Fixes: b2f557a21bc8 ("mm/madvise: add cond_resched() in madvise_cold_or_pageout_pte_range()") Signed-off-by: Sergey Senozhatsky Cc: Jiexun Wang Signed-off-by: Andrew Morton --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 912155a94ed587..cfa5e728826118 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -429,6 +429,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; From a48c290a3da03746500fdf1c566a40f62154df0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:34:38 -0500 Subject: [PATCH 403/707] mm: zswap: fix objcg use-after-free in entry destruction In the per-memcg LRU universe, LRU removal uses entry->objcg to determine which list count needs to be decreased. Drop the objcg reference after updating the LRU, to fix a possible use-after-free. Link: https://lkml.kernel.org/r/20240130013438.565167-1-hannes@cmpxchg.org Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ca25b676048ea6..0a94b197ed32e1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -536,10 +536,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { @@ -548,6 +544,10 @@ static void zswap_free_entry(struct zswap_entry *entry) atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); From 23469997cda3afa5153e3cd40de94680e67eb2b9 Mon Sep 17 00:00:00 2001 From: John Moon Date: Wed, 31 Jan 2024 03:43:18 +0000 Subject: [PATCH 404/707] mailmap: switch email address for John Moon Add current email address as QUIC email is no longer active. Link: https://lkml.kernel.org/r/20240131034311.46706-1-john@jmoon.dev Signed-off-by: John Moon Acked-by: Trilok Soni Cc: Elliot Berman Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 04998f7bda8181..8ae00bd3708a73 100644 --- a/.mailmap +++ b/.mailmap @@ -289,6 +289,7 @@ Johan Hovold John Crispin John Fastabend John Keeping +John Moon John Paul Adrian Glaubitz John Stultz From 3cc3d209406993d20887a521c8a27f7388eee66c Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 30 Jan 2024 22:04:18 +0100 Subject: [PATCH 405/707] fs,hugetlb: fix NULL pointer dereference in hugetlbs_fill_super When configuring a hugetlb filesystem via the fsconfig() syscall, there is a possible NULL dereference in hugetlbfs_fill_super() caused by assigning NULL to ctx->hstate in hugetlbfs_parse_param() when the requested pagesize is non valid. E.g: Taking the following steps: fd = fsopen("hugetlbfs", FSOPEN_CLOEXEC); fsconfig(fd, FSCONFIG_SET_STRING, "pagesize", "1024", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); Given that the requested "pagesize" is invalid, ctxt->hstate will be replaced with NULL, losing its previous value, and we will print an error: ... ... case Opt_pagesize: ps = memparse(param->string, &rest); ctx->hstate = h; if (!ctx->hstate) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } return 0; ... ... This is a problem because later on, we will dereference ctxt->hstate in hugetlbfs_fill_super() ... ... sb->s_blocksize = huge_page_size(ctx->hstate); ... ... Causing below Oops. Fix this by replacing cxt->hstate value only when then pagesize is known to be valid. kernel: hugetlbfs: Unsupported page size 0 MB kernel: BUG: kernel NULL pointer dereference, address: 0000000000000028 kernel: #PF: supervisor read access in kernel mode kernel: #PF: error_code(0x0000) - not-present page kernel: PGD 800000010f66c067 P4D 800000010f66c067 PUD 1b22f8067 PMD 0 kernel: Oops: 0000 [#1] PREEMPT SMP PTI kernel: CPU: 4 PID: 5659 Comm: syscall Tainted: G E 6.8.0-rc2-default+ #22 5a47c3fef76212addcc6eb71344aabc35190ae8f kernel: Hardware name: Intel Corp. GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017 kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400 kernel: FS: 00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0 kernel: Call Trace: kernel: kernel: ? __die_body+0x1a/0x60 kernel: ? page_fault_oops+0x16f/0x4a0 kernel: ? search_bpf_extables+0x65/0x70 kernel: ? fixup_exception+0x22/0x310 kernel: ? exc_page_fault+0x69/0x150 kernel: ? asm_exc_page_fault+0x22/0x30 kernel: ? __pfx_hugetlbfs_fill_super+0x10/0x10 kernel: ? hugetlbfs_fill_super+0xb4/0x1a0 kernel: ? hugetlbfs_fill_super+0x28/0x1a0 kernel: ? __pfx_hugetlbfs_fill_super+0x10/0x10 kernel: vfs_get_super+0x40/0xa0 kernel: ? __pfx_bpf_lsm_capable+0x10/0x10 kernel: vfs_get_tree+0x25/0xd0 kernel: vfs_cmd_create+0x64/0xe0 kernel: __x64_sys_fsconfig+0x395/0x410 kernel: do_syscall_64+0x80/0x160 kernel: ? syscall_exit_to_user_mode+0x82/0x240 kernel: ? do_syscall_64+0x8d/0x160 kernel: ? syscall_exit_to_user_mode+0x82/0x240 kernel: ? do_syscall_64+0x8d/0x160 kernel: ? exc_page_fault+0x69/0x150 kernel: entry_SYSCALL_64_after_hwframe+0x6e/0x76 kernel: RIP: 0033:0x7ffbc0cb87c9 kernel: Code: 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 96 0d 00 f7 d8 64 89 01 48 kernel: RSP: 002b:00007ffc29d2f388 EFLAGS: 00000206 ORIG_RAX: 00000000000001af kernel: RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ffbc0cb87c9 kernel: RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003 kernel: RBP: 00007ffc29d2f3b0 R08: 0000000000000000 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 kernel: R13: 00007ffc29d2f4c0 R14: 0000000000000000 R15: 0000000000000000 kernel: kernel: Modules linked in: rpcsec_gss_krb5(E) auth_rpcgss(E) nfsv4(E) dns_resolver(E) nfs(E) lockd(E) grace(E) sunrpc(E) netfs(E) af_packet(E) bridge(E) stp(E) llc(E) iscsi_ibft(E) iscsi_boot_sysfs(E) intel_rapl_msr(E) intel_rapl_common(E) iTCO_wdt(E) intel_pmc_bxt(E) sb_edac(E) iTCO_vendor_support(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) rfkill(E) ipmi_ssif(E) kvm(E) acpi_ipmi(E) irqbypass(E) pcspkr(E) igb(E) ipmi_si(E) mei_me(E) i2c_i801(E) joydev(E) intel_pch_thermal(E) i2c_smbus(E) dca(E) lpc_ich(E) mei(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_pad(E) tiny_power_button(E) button(E) fuse(E) efi_pstore(E) configfs(E) ip_tables(E) x_tables(E) ext4(E) mbcache(E) jbd2(E) hid_generic(E) usbhid(E) sd_mod(E) t10_pi(E) crct10dif_pclmul(E) crc32_pclmul(E) crc32c_intel(E) polyval_clmulni(E) ahci(E) xhci_pci(E) polyval_generic(E) gf128mul(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha256_ssse3(E) xhci_pci_renesas(E) libahci(E) ehci_pci(E) sha1_ssse3(E) xhci_hcd(E) ehci_hcd(E) libata(E) kernel: mgag200(E) i2c_algo_bit(E) usbcore(E) wmi(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) scsi_common(E) aesni_intel(E) crypto_simd(E) cryptd(E) kernel: Unloaded tainted modules: acpi_cpufreq(E):1 fjes(E):1 kernel: CR2: 0000000000000028 kernel: ---[ end trace 0000000000000000 ]--- kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400 kernel: FS: 00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0 Link: https://lkml.kernel.org/r/20240130210418.3771-1-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ee13c2ca8ad293..d746866ae3b6ba 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1365,6 +1365,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; + struct hstate *h; char *rest; unsigned long ps; int opt; @@ -1409,11 +1410,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_pagesize: ps = memparse(param->string, &rest); - ctx->hstate = size_to_hstate(ps); - if (!ctx->hstate) { + h = size_to_hstate(ps); + if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } + ctx->hstate = h; return 0; case Opt_min_size: From bbac2bacc15831e9f92dbf7deabe8192c2d8ea92 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:49 +0000 Subject: [PATCH 406/707] mm/zswap: don't return LRU_SKIP if we have dropped lru lock LRU_SKIP can only be returned if we don't ever dropped lru lock, or we need to return LRU_RETRY to restart from the head of lru list. Otherwise, the iteration might continue from a cursor position that was freed while the locks were dropped. Actually we may need to introduce another LRU_STOP to really terminate the ongoing shrinking scan process, when we encounter a warm page already in the swap cache. The current list_lru implementation doesn't have this function to early break from __list_lru_walk_one. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-1-b10479847099@bytedance.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0a94b197ed32e1..350dd2fc815994 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -895,10 +895,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) { - ret = LRU_SKIP; + if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - } goto put_unlock; } From a9243ad48dad5605fadeabdf5d146576e1297aa0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:51:27 +0000 Subject: [PATCH 407/707] mm: zswap: fix missing folio cleanup in writeback race path In zswap_writeback_entry(), after we get a folio from __read_swap_cache_async(), we grab the tree lock again to check that the swap entry was not invalidated and recycled. If it was, we delete the folio we just added to the swap cache and exit. However, __read_swap_cache_async() returns the folio locked when it is newly allocated, which is always true for this path, and the folio is ref'd. Make sure to unlock and put the folio before returning. This was discovered by code inspection, probably because this path handles a race condition that should not happen often, and the bug would not crash the system, it will only strand the folio indefinitely. Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com Fixes: 04fc7816089c ("mm: fix zswap writeback race condition") Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Domenico Cerasuolo Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 350dd2fc815994..d2423247acfd64 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); From 064fd31ffbe142d57dc698081f99de5b78979e68 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 9 Jan 2024 17:22:33 -0800 Subject: [PATCH 408/707] mm/cma: fix placement of trace_cma_alloc_start/finish The current placement of trace_cma_alloc_start/finish misses the fail cases: !cma || !cma->count || !cma->bitmap. trace_cma_alloc_finish is also not emitted for the failure case where bitmap_count > bitmap_maxno. Fix these missed cases by moving the start event before the failure checks and moving the finish event to the out label. Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing") Signed-off-by: Kalesh Singh Cc: Minchan Kim Cc: Liam Mark Signed-off-by: Andrew Morton --- mm/cma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 7c09c47e530bf6..e12cf41d83549a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -436,6 +436,9 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned long i; struct page *page = NULL; int ret = -ENOMEM; + const char *name = cma ? cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) goto out; @@ -446,8 +449,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (!count) goto out; - trace_cma_alloc_start(cma->name, count, align); - mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -496,8 +497,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -516,6 +515,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); From c3f7c49c1fd3cea9b486dd38e20082dab4b3904a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 9 Jan 2024 14:31:19 -0800 Subject: [PATCH 409/707] maple_tree: fix comment describing mas_node_count_gfp() The function description comment for mas_node_count_gfp() mistakingly refers to the function as mas_node_count(). Change it to refer to the correct function. Link: https://lkml.kernel.org/r/20240109223119.162357-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Peng Zhang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6f241bb3879920..7b161802860bdb 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1307,8 +1307,8 @@ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) } /* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. + * mas_node_count_gfp() - Check if enough nodes are allocated and request more + * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags From 372abffc7919cffd734d993cc5cc33c0f60b9175 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:43 +0100 Subject: [PATCH 410/707] mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers Patch series "implement "memmap on memory" feature on s390". This series provides "memmap on memory" support on s390 platform. "memmap on memory" allows struct pages array to be allocated from the hotplugged memory range instead of allocating it from main system memory. s390 currently preallocates struct pages array for all potentially possible memory, which ensures memory onlining always succeeds, but with the cost of significant memory consumption from the available system memory during boottime. In certain extreme configuration, this could lead to ipl failure. "memmap on memory" ensures struct pages array are populated from self contained hotplugged memory range instead of depleting the available system memory and this could eliminate ipl failure on s390 platform. On other platforms, system might go OOM when the physically hotplugged memory depletes the available memory before it is onlined. Hence, "memmap on memory" feature was introduced as described in commit a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range"). Unlike other architectures, s390 memory blocks are not physically accessible until it is online. To make it physically accessible two new memory notifiers MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE are added and this notifier lets the hypervisor inform that the memory should be made physically accessible. This allows for "memmap on memory" initialization during memory hotplug onlining phase, which is performed before calling MEM_GOING_ONLINE notifier. Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. New mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure altmap cannot be written when adding memory - before it is set online. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Patches 2 allocates vmemmap pages from self-contained memory range for s390. It allocates memory map (struct pages array) from the hotplugged memory range, rather than using system memory by passing altmap to vmemmap functions. Patch 3 removes unhandled memory notifier types on s390. Patch 4 implements MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers on s390. MEM_PREPARE_ONLINE memory notifier makes memory block physical accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enabling the "memmap on memory" on s390. MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390. This patch (of 5): Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Platforms such as x86 can support physical memory hotplug via ACPI. When there is physical memory hotplug, ACPI event leads to the memory addition with the following callchain: acpi_memory_device_add() -> acpi_memory_enable_device() -> __add_memory() After this, the hotplugged memory is physically accessible, and altmap support prepared, before the "memmap on memory" initialization in memory_block_online() is called. On s390, memory hotplug works in a different way. The available hotplug memory has to be defined upfront in the hypervisor, but it is made physically accessible only when the user sets it online via sysfs, currently in the MEM_GOING_ONLINE notifier. This is too late and "memmap on memory" initialization is performed before calling MEM_GOING_ONLINE notifier. During the memory hotplug addition phase, altmap support is prepared and during the memory onlining phase s390 requires memory to be physically accessible and then subsequently initiate the "memmap on memory" initialization process. The memory provider will handle new MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE notifications and make the memory accessible. The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant when used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be written (e.g., poisoned) when adding memory -- before it is set online. This allows for adding memory with an altmap that is not currently made available by a hypervisor. When onlining that memory, the hypervisor can be instructed to make that memory accessible via the new notifiers and the onlining phase will not require any memory allocations, which is helpful in low-memory situations. All architectures ignore unknown memory notifiers. Therefore, the introduction of these new notifiers does not result in any functional modifications across architectures. Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Suggested-by: Gerald Schaefer Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/base/memory.c | 23 ++++++++++++++++++++++- include/linux/memory.h | 9 +++++++++ include/linux/memory_hotplug.h | 18 +++++++++++++++++- include/linux/memremap.h | 1 + mm/memory_hotplug.c | 17 ++++++++++++++--- mm/sparse.c | 3 ++- 6 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 14f964a7719bd0..c0436f46cfb701 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; struct zone *zone; int ret; @@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem) if (mem->altmap) nr_vmemmap_pages = mem->altmap->free; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; mem_hotplug_begin(); + ret = memory_notify(MEM_PREPARE_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto out_notifier; + if (nr_vmemmap_pages) { - ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, + zone, mem->altmap->inaccessible); if (ret) goto out; } @@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; + mem_hotplug_done(); + return ret; out: + memory_notify(MEM_FINISH_OFFLINE, &arg); +out_notifier: mem_hotplug_done(); return ret; } @@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; int ret; if (!mem->zone) @@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; + memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa4166..939a16bd5cea15 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d207658349416..ee00015575aab3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. + */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b132c..9837f3e6fb9582 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21890994c1d3cc..707027f691503f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone) + struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; + /* + * Memory block is accessible at this stage and hence poison the struct + * pages now. If the memory block is accessible during memory hotplug + * addition phase, then page poisining is already performed in + * sparse_add_section(). + */ + if (mhp_off_inaccessible) + page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) @@ -1415,7 +1424,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size) + u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1431,6 +1440,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); + if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) + mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1516,7 +1527,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory(memory_block_size_bytes())) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size); + ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 338cf946dee8de..aed0951b87fa04 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); + if (!altmap || !altmap->inaccessible) + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); From 18d6f1015a1913132d32f5cfa84aa301ad7e5f84 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:44 +0100 Subject: [PATCH 411/707] s390/mm: allocate vmemmap pages from self-contained memory range Allocate memory map (struct pages array) from the hotplugged memory range, rather than using system memory. The change addresses the issue where standby memory, when configured to be much larger than online memory, could potentially lead to ipl failure due to memory map allocation from online memory. For example, 16MB of memory map allocation is needed for a memory block size of 1GB and when standby memory is configured much larger than online memory, this could lead to ipl failure. To address this issue, the solution involves introducing "memmap on memory" using the vmem_altmap structure on s390. Architectures that want to implement it should pass the altmap to the vmemmap_populate() function and its associated callchain. This enhancement is discussed in commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()") Provide "memmap on memory" support for s390 by passing the altmap in vmemmap_populate() and its callchain. The allocation path is described as follows: * When altmap is NULL in vmemmap_populate(), memory map allocation occurs using the existing vmemmap_alloc_block_buf(). * When altmap is not NULL in vmemmap_populate(), memory map allocation still uses vmemmap_alloc_block_buf(), but this function internally calls altmap_alloc_block_buf(). For deallocation, the process is outlined as follows: * When altmap is NULL in vmemmap_free(), memory map deallocation happens through free_pages(). * When altmap is not NULL in vmemmap_free(), memory map deallocation occurs via vmem_altmap_free(). While memory map allocation is primarily handled through the self-contained memory map range, there might still be a small amount of system memory allocation required for vmemmap pagetables. To mitigate this impact, this feature will be limited to machines with EDAT1 support. Link: https://lkml.kernel.org/r/20240108132747.3238763-3-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 3 --- arch/s390/mm/vmem.c | 62 +++++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 43e612bc2bcd34..8d9a60ccb7771a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -281,9 +281,6 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 186a020857cf6a..eb100479f7bec4 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -33,8 +33,12 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) @@ -156,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; @@ -172,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -234,11 +240,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -261,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -280,7 +286,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -302,12 +308,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start) for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -347,7 +353,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_large(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -370,12 +376,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start) if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -394,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -415,12 +421,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start) if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -445,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -458,14 +464,16 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -474,7 +482,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) static int vmem_add_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - return add_pagetable(start, start + size, true); + return add_pagetable(start, start + size, true, NULL); } /* @@ -483,7 +491,7 @@ static int vmem_add_range(unsigned long start, unsigned long size) static void vmem_remove_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - remove_pagetable(start, start + size, true); + remove_pagetable(start, start + size, true, NULL); } /* @@ -496,9 +504,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } @@ -509,7 +517,7 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } From 0315083eda6e6cb355b825588525a6482d266891 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:45 +0100 Subject: [PATCH 412/707] s390/sclp: remove unhandled memory notifier type Remove memory notifier types which are unhandled by s390. Unhandled memory notifier types are covered by default case. Link: https://lkml.kernel.org/r/20240108132747.3238763-4-sumanthk@linux.ibm.com Suggested-by: Alexander Gordeev Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 11c428f4c7cf9c..355e63e44e9546 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -340,9 +340,6 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; case MEM_GOING_ONLINE: rc = sclp_mem_change_state(start, size, 1); break; From 5aab491ca6980dc38668befba71764afc3b7c2b3 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:46 +0100 Subject: [PATCH 413/707] s390/mm: implement MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers MEM_PREPARE_ONLINE memory notifier makes memory block physical accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enabling the "memmap on memory" on s390. MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Implementation considerations: * When MHP_MEMMAP_ON_MEMORY is disabled, the system retains the old behavior. This means the memory map is allocated from default memory. * If MACHINE_HAS_EDAT1 is unavailable, MHP_MEMMAP_ON_MEMORY is automatically disabled. This ensures that vmemmap pagetables do not consume additional memory from the default memory allocator. * The MEM_GOING_ONLINE notifier has been modified to perform no operation, as MEM_PREPARE_ONLINE already executes the sclp assign command. * The MEM_CANCEL_ONLINE/MEM_OFFLINE notifier now performs no operation, as MEM_FINISH_OFFLINE already executes the sclp unassign command. Link: https://lkml.kernel.org/r/20240108132747.3238763-5-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 355e63e44e9546..7815e9bea69a13 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include "sclp.h" @@ -340,13 +342,38 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_GOING_ONLINE: + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. + */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); break; - case MEM_CANCEL_ONLINE: - sclp_mem_change_state(start, size, 0); - break; - case MEM_OFFLINE: + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } sclp_mem_change_state(start, size, 0); break; default: @@ -397,7 +424,9 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size, MHP_NONE); + add_memory(0, addr, block_size, + MACHINE_HAS_EDAT1 ? + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); skip_add: first_rn = rn; num = 1; From 1b08541d5ddbfd13d3a38c0eee3804fd7e06ecff Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:47 +0100 Subject: [PATCH 414/707] s390: enable MHP_MEMMAP_ON_MEMORY Enable MHP_MEMMAP_ON_MEMORY to support "memmap on memory". memory_hotplug.memmap_on_memory=true kernel parameter should be set in kernel boot option to enable the feature. Link: https://lkml.kernel.org/r/20240108132747.3238763-6-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fe565f3a3a917d..a1d6dcbc89654c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -113,6 +113,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC From f1528ed4b3d39f947278be070bd94865c47df157 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 8 Jan 2024 12:48:15 +0800 Subject: [PATCH 415/707] mm/filemap: avoid type conversion The return type of function folio_test_hugetlb is bool type, there is no need to assign it to an integer type. Link: https://lkml.kernel.org/r/20240108044815.3291487-1-lihongbo22@huawei.com Signed-off-by: Hongbo Li Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db74..0d7e20edf46f59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - int huge = folio_test_hugetlb(folio); + bool huge = folio_test_hugetlb(folio); bool charged = false; long nr = 1; From d969a80a29f23e42bdf3e0f6493bec63fe2e542c Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Fri, 5 Jan 2024 12:24:01 -0800 Subject: [PATCH 416/707] selftests/mm/ksm_functional: prevent unmapping undefined address Replace some goto statements with return statements so that unmap() is not called on an undefined address. This change is made so that unmap() can only be reached after mmap() is called (and the address mentioned is defined). Returning MAP_FAILED seems acceptable since client code checks for this value. Link: https://lkml.kernel.org/r/20240105202401.28851-1-inwardvessel@gmail.com Fixes: 42096aa24b82 ("selftest/mm: ksm_functional_tests: test in mmap_and_merge_range() if anything got merged") Signed-off-by: JP Kobryn Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index fbff0dd09191f1..d615767e396bec 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - goto unmap; + return MAP_FAILED; } if (get_my_merging_pages() > 0) { ksft_test_result_fail("Still pages merged\n"); - goto unmap; + return MAP_FAILED; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, From a87529d95e3a0372f04c5c8281fb6a81b358a7f3 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 5 Jan 2024 07:54:19 -0800 Subject: [PATCH 417/707] selftests/mm: new test that steals pages This test stresses the race between of madvise(DONTNEED), a page fault and a parallel huge page mmap, which should fail due to lack of available page available for mapping. This test case must run on a system with one and only one huge page available. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages During setup, the test allocates the only available page, and starts three threads: - thread 1: * madvise(MADV_DONTNEED) on the allocated huge page - thread 2: * Write to the allocated huge page - thread 3: * Tries to allocated (steal) an extra huge page (which is not available) thread 3 should never succeed in the allocation, since the only huge page was never unmapped, and should be reserved. Touching the old page after thread3 allocation will raise a SIGBUS. Link: https://lkml.kernel.org/r/20240105155419.1939484-2-leitao@debian.org Signed-off-by: Breno Leitao Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Rik van Riel Cc: Shuah Khan Cc: Vegard Nossum Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/hugetlb_madv_vs_map.c | 124 ++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 4ff10ea6146179..d26e962f2ac490 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -46,3 +46,4 @@ gup_longterm mkdirty va_high_addr_switch hugetlb_fault_after_madv +hugetlb_madv_vs_map diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2453add65d12f8..990e9bb112c507 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv +TEST_GEN_FILES += hugetlb_madv_vs_map ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c new file mode 100644 index 00000000000000..d01e8d4901d0b5 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case that must run on a system with one and only one huge page available. + * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * + * During setup, the test allocates the only available page, and starts three threads: + * - thread1: + * * madvise(MADV_DONTNEED) on the allocated huge page + * - thread 2: + * * Write to the allocated huge page + * - thread 3: + * * Try to allocated an extra huge page (which must not available) + * + * The test fails if thread3 is able to allocate a page. + * + * Touching the first page after thread3's allocation will raise a SIGBUS + * + * Author: Breno Leitao + */ +#include +#include +#include +#include +#include +#include + +#include "vm_util.h" +#include "../kselftest.h" + +#define MMAP_SIZE (1 << 21) +#define INLOOP_ITER 100 + +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + huge_ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + + return NULL; +} + +/* + * We got here, and there must be no huge page available for mapping + * The other hugepage should be flipping from used <-> reserved, because + * of madvise(DONTNEED). + */ +void *map_extra(void *unused) +{ + void *ptr; + + for (int i = 0; i < INLOOP_ITER; i++) { + ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((long)ptr != -1) { + /* Touching the other page now will cause a SIGBUG + * huge_ptr[0] = '1'; + */ + return ptr; + } + } + + return NULL; +} + +int main(void) +{ + pthread_t thread1, thread2, thread3; + unsigned long free_hugepages; + void *ret; + + /* + * On kernel 6.7, we are able to reproduce the problem with ~10 + * interactions + */ + int max = 10; + + free_hugepages = get_free_hugepages(); + + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + while (max--) { + huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) { + ksft_exit_skip("Failed to allocated huge page\n"); + return KSFT_SKIP; + } + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + pthread_create(&thread3, NULL, map_extra, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + pthread_join(thread3, &ret); + + if (ret) { + ksft_test_result_fail("Unexpected huge page allocation\n"); + return KSFT_FAIL; + } + + /* Unmap and restart */ + munmap(huge_ptr, MMAP_SIZE); + } + + return KSFT_PASS; +} From 52729cfd76d397b50105e89d9cfa7a6e1e9ff2cd Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:23 +0100 Subject: [PATCH 418/707] mm: vmalloc: add va_alloc() helper Patch series "Mitigate a vmap lock contention", v3. 1. Motivation - Offload global vmap locks making it scaled to number of CPUS; - If possible and there is an agreement, we can remove the "Per cpu kva allocator" to make the vmap code to be more simple; - There were complaints from XFS folk that a vmalloc might be contented on their workloads. 2. Design(high level overview) We introduce an effective vmap node logic. A node behaves as independent entity to serve an allocation request directly(if possible) from its pool. That way it bypasses a global vmap space that is protected by its own lock. An access to pools are serialized by CPUs. Number of nodes are equal to number of CPUs in a system. Please note the high threshold is bound to 128 nodes. Pools are size segregated and populated based on system demand. The maximum alloc request that can be stored into a segregated storage is 256 pages. The lazily drain path decays a pool by 25% as a first step and as second populates it by fresh freed VAs for reuse instead of returning them into a global space. When a VA is obtained(alloc path), it is stored in separate nodes. A va->va_start address is converted into a correct node where it should be placed and resided. Doing so we balance VAs across the nodes as a result an access becomes scalable. The addr_to_node() function does a proper address conversion to a correct node. A vmap space is divided on segments with fixed size, it is 16 pages. That way any address can be associated with a segment number. Number of segments are equal to num_possible_cpus() but not grater then 128. The numeration starts from 0. See below how it is converted: static inline unsigned int addr_to_node_id(unsigned long addr) { return (addr / zone_size) % nr_nodes; } On a free path, a VA can be easily found by converting its "va_start" address to a certain node it resides. It is moved from "busy" data to "lazy" data structure. Later on, as noted earlier, the lazy kworker decays each node pool and populates it by fresh incoming VAs. Please note, a VA is returned to a node that did an alloc request. 3. Test on AMD Ryzen Threadripper 3970X 32-Core Processor sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that a native_queued_spin_lock_slowpath goes down to 16.51% percent from 93.07%. The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ This patch (of 11): Currently __alloc_vmap_area() function contains an open codded logic that finds and adjusts a VA based on allocation request. Introduce a va_alloc() helper that adjusts found VA only. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Cc: Kazuhito Hagio Signed-off-by: Andrew Morton --- mm/vmalloc.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d12a17fc0c171c..739401a9eafcfe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1481,6 +1481,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, return 0; } +static unsigned long +va_alloc(struct vmap_area *va, + struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend) +{ + unsigned long nva_start_addr; + int ret; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + if (WARN_ON_ONCE(ret)) + return vend; + + return nva_start_addr; +} + /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. @@ -1493,7 +1519,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; - int ret; /* * Do not adjust when: @@ -1511,18 +1536,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, if (unlikely(!va)) return vend; - if (va->va_start > vstart) - nva_start_addr = ALIGN(va->va_start, align); - else - nva_start_addr = ALIGN(vstart, align); - - /* Check the "vend" restriction. */ - if (nva_start_addr + size > vend) - return vend; - - /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); - if (WARN_ON_ONCE(ret)) + nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); + if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK From 169a5a6ddde9a36782c5738e108c858b54f5511b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:24 +0100 Subject: [PATCH 419/707] mm: vmalloc: rename adjust_va_to_fit_type() function This patch renames the adjust_va_to_fit_type() function to va_clip() which is shorter and more expressive. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 739401a9eafcfe..10f289e865122a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1382,9 +1382,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, - struct vmap_area *va, unsigned long nva_start_addr, - unsigned long size) +va_clip(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1500,7 +1500,7 @@ va_alloc(struct vmap_area *va, return vend; /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; @@ -4155,9 +4155,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(&free_vmap_area_root, - &free_vmap_area_list, - va, start, size); + ret = va_clip(&free_vmap_area_root, + &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; From 51d4a5c59481a6edb5caa52953f38f686e8f893c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:25 +0100 Subject: [PATCH 420/707] mm: vmalloc: move vmap_init_free_space() down in vmalloc.c A vmap_init_free_space() is a function that setups a vmap space and is considered as part of initialization phase. Since a main entry which is vmalloc_init(), has been moved down in vmalloc.c it makes sense to follow the pattern. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 82 ++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 10f289e865122a..06bd843d18ae99 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2512,47 +2512,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void vmap_init_free_space(void) -{ - unsigned long vmap_start = 1; - const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; - - /* - * B F B B B F - * -|-----|.....|-----|-----|-----|.....|- - * | The KVA space | - * |<--------------------------------->| - */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = busy->va_start; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } - - vmap_start = busy->va_end; - } - - if (vmap_end - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = vmap_end; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -4465,6 +4424,47 @@ module_init(proc_vmalloc_init); #endif +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; From aac1abf328a2616ad925eeb326d2c6691c628102 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:26 +0100 Subject: [PATCH 421/707] mm: vmalloc: remove global vmap_area_root rb-tree Store allocated objects in a separate nodes. A va->va_start address is converted into a correct node where it should be placed and resided. An addr_to_node() function is used to do a proper address conversion to determine a node that contains a VA. Such approach balances VAs across nodes as a result an access becomes scalable. Number of nodes in a system depends on number of CPUs. Please note: 1. As of now allocated VAs are bound to a node-0. It means the patch does not give any difference comparing with a current behavior; 2. The global vmap_area_lock, vmap_area_root are removed as there is no need in it anymore. The vmap_area_list is still kept and is _empty_. It is exported for a kexec only; 3. The vmallocinfo and vread() have to be reworked to be able to handle multiple nodes. Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 240 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 06bd843d18ae99..786ecb18ae228b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT; */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); +/* + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. + */ +struct rb_list { + struct rb_root root; + struct list_head head; + spinlock_t lock; +}; + +static struct vmap_node { + /* Bookkeeping data of this node. */ + struct rb_list busy; +} single; + +static struct vmap_node *vmap_nodes = &single; +static __read_mostly unsigned int nr_vmap_nodes = 1; +static __read_mostly unsigned int vmap_zone_size = 1; + +static inline unsigned int +addr_to_node_id(unsigned long addr) +{ + return (addr / vmap_zone_size) % nr_vmap_nodes; +} + +static inline struct vmap_node * +addr_to_node(unsigned long addr) +{ + return &vmap_nodes[addr_to_node_id(addr)]; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -803,10 +833,11 @@ unsigned long vmalloc_nr_pages(void) } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ -static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +static struct vmap_area * +find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -1552,12 +1583,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, */ static void free_vmap_area(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); + /* * Remove from the busy tree/list. */ - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + spin_lock(&vn->busy.lock); + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. @@ -1600,6 +1633,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask, unsigned long va_flags) { + struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; @@ -1645,9 +1679,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->vm = NULL; va->flags = va_flags; - spin_lock(&vmap_area_lock); - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); + spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); @@ -1871,26 +1907,61 @@ static void free_unmap_vmap_area(struct vmap_area *va) struct vmap_area *find_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - spin_unlock(&vmap_area_lock); + /* + * An addr_to_node_id(addr) converts an address to a node index + * where a VA is located. If VA spans several zones and passed + * addr is not the same as va->va_start, what is not common, we + * may need to scan an extra nodes. See an example: + * + * <--va--> + * -|-----|-----|-----|-----|- + * 1 2 0 1 + * + * VA resides in node 1 whereas it spans 1 and 2. If passed + * addr is within a second node we should do extra work. We + * should mention that it is rare and is a corner case from + * the other hand it has to be covered. + */ + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - if (va) - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + if (va) + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } /*** Per cpu kva allocator ***/ @@ -2092,6 +2163,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) static void free_vmap_block(struct vmap_block *vb) { + struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; @@ -2099,9 +2171,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); - spin_lock(&vmap_area_lock); - unlink_va(vb->va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(vb->va->va_start); + spin_lock(&vn->busy.lock); + unlink_va(vb->va, &vn->busy.root); + spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); @@ -2525,9 +2598,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); setup_vmalloc_vm_locked(vm, va, flags, caller); - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) @@ -3715,6 +3790,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; @@ -3728,8 +3804,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - spin_lock(&vmap_area_lock); - va = find_vmap_area_exceed_addr((unsigned long)addr); + /* Hooked to node_0 so far. */ + vn = addr_to_node(0); + spin_lock(&vn->busy.lock); + + va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); if (!va) goto finished_zero; @@ -3737,7 +3816,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vmap_area_list, list) { + list_for_each_entry_from(va, &vn->busy.head, list) { size_t copied; if (remains == 0) @@ -3796,12 +3875,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) } finished_zero: - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); return count - remains; } @@ -4135,14 +4214,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } /* insert all vm's */ - spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { - insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + struct vmap_node *vn = addr_to_node(vas[area]->va_start); + spin_lock(&vn->busy.lock); + insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + spin_unlock(&vn->busy.lock); } - spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. Do it now as a best-effort @@ -4253,55 +4333,57 @@ bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; - struct vm_struct *vm; struct vmap_area *va; + struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; + bool success = false; - if (!spin_trylock(&vmap_area_lock)) - return false; - va = __find_vmap_area((unsigned long)objp, &vmap_area_root); - if (!va) { - spin_unlock(&vmap_area_lock); - return false; - } + vn = addr_to_node((unsigned long)objp); - vm = va->vm; - if (!vm) { - spin_unlock(&vmap_area_lock); - return false; + if (spin_trylock(&vn->busy.lock)) { + va = __find_vmap_area(addr, &vn->busy.root); + + if (va && va->vm) { + addr = (unsigned long)va->vm->addr; + caller = va->vm->caller; + nr_pages = va->vm->nr_pages; + success = true; + } + + spin_unlock(&vn->busy.lock); } - addr = (unsigned long)vm->addr; - caller = vm->caller; - nr_pages = vm->nr_pages; - spin_unlock(&vmap_area_lock); - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); - return true; + + if (success) + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return success; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmap_purge_lock) - __acquires(&vmap_area_lock) { + struct vmap_node *vn = addr_to_node(0); + mutex_lock(&vmap_purge_lock); - spin_lock(&vmap_area_lock); + spin_lock(&vn->busy.lock); - return seq_list_start(&vmap_area_list, *pos); + return seq_list_start(&vn->busy.head, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &vmap_area_list, pos); + struct vmap_node *vn = addr_to_node(0); + return seq_list_next(p, &vn->busy.head, pos); } static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_area_lock) - __releases(&vmap_purge_lock) { - spin_unlock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(0); + + spin_unlock(&vn->busy.lock); mutex_unlock(&vmap_purge_lock); } @@ -4344,9 +4426,11 @@ static void show_purge_info(struct seq_file *m) static int s_show(struct seq_file *m, void *p) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + vn = addr_to_node(0); va = list_entry(p, struct vmap_area, list); if (!va->vm) { @@ -4397,7 +4481,7 @@ static int s_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. */ final: - if (list_is_last(&va->list, &vmap_area_list)) + if (list_is_last(&va->list, &vn->busy.head)) show_purge_info(m); return 0; @@ -4428,7 +4512,8 @@ static void vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; + struct vmap_area *free; + struct vm_struct *busy; /* * B F B B B F @@ -4436,12 +4521,12 @@ static void vmap_init_free_space(void) * | The KVA space | * |<--------------------------------->| */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { + for (busy = vmlist; busy; busy = busy->next) { + if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; - free->va_end = busy->va_start; + free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, @@ -4449,7 +4534,7 @@ static void vmap_init_free_space(void) } } - vmap_start = busy->va_end; + vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { @@ -4465,9 +4550,23 @@ static void vmap_init_free_space(void) } } +static void vmap_init_nodes(void) +{ + struct vmap_node *vn; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + vn->busy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->busy.head); + spin_lock_init(&vn->busy.lock); + } +} + void __init vmalloc_init(void) { struct vmap_area *va; + struct vmap_node *vn; struct vm_struct *tmp; int i; @@ -4489,6 +4588,11 @@ void __init vmalloc_init(void) xa_init(&vbq->vmap_blocks); } + /* + * Setup nodes before importing vmlist. + */ + vmap_init_nodes(); + /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); @@ -4498,7 +4602,9 @@ void __init vmalloc_init(void) va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + + vn = addr_to_node(va->va_start); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* From 2d6b6cc380b9a922b651e5c93a98ef4f68c0b429 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 14:26:28 +0100 Subject: [PATCH 422/707] mm: vmalloc: mark vmap_init_free_space() with __init tag vmap_init_free_space() is called only once therefore tag it with __init. Apart of that it access the "vmlist" variable that is located in ".init.data" section. Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401112056.I41bELL4-lkp@intel.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Cc: Baoquan He Cc: Dave Chinner Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 786ecb18ae228b..666ea8a379f6bb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4508,7 +4508,7 @@ module_init(proc_vmalloc_init); #endif -static void vmap_init_free_space(void) +static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; From 60eb705ddffe5673b9fb819338aa50ee779b91c4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 13:11:04 +0100 Subject: [PATCH 423/707] fix a wrong value passed to __find_vmap_area() There was a typo in the vmalloc_dump_obj() function. Instead of passing a real address which is "objp" an "addr" was used what is wrong and not initialized. Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree") Closes: https://lore.kernel.org/oe-kbuild-all/202401111810.TKPIXLCs-lkp@intel.com/ Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nathan Chancellor Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 666ea8a379f6bb..86efebf0e0c8a7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4342,7 +4342,7 @@ bool vmalloc_dump_obj(void *object) vn = addr_to_node((unsigned long)objp); if (spin_trylock(&vn->busy.lock)) { - va = __find_vmap_area(addr, &vn->busy.root); + va = __find_vmap_area((unsigned long)objp, &vn->busy.root); if (va && va->vm) { addr = (unsigned long)va->vm->addr; From 697948bd0325e89b3d31d2074bcdcd3868c137a1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 2 Jan 2024 19:46:27 +0100 Subject: [PATCH 424/707] mm/vmalloc: remove vmap_area_list Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile get the base address of vmalloc area. Now, vmap_area_list is empty, so export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list. Link: https://lkml.kernel.org/r/20240102184633.748113-6-urezki@gmail.com Signed-off-by: Baoquan He Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Lorenzo Stoakes Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++---- arch/arm64/kernel/crash_core.c | 1 - arch/riscv/kernel/crash_core.c | 1 - include/linux/vmalloc.h | 1 - kernel/crash_core.c | 4 +--- kernel/kallsyms_selftest.c | 1 - mm/nommu.c | 2 -- mm/vmalloc.c | 2 -- 8 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index bced9e4b6e0899..0f714fc945acf4 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates the kernel start address. Used to convert a virtual address from the direct kernel map to a physical address. -vmap_area_list --------------- +VMALLOC_START +------------- -Stores the virtual area list. makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. +Stores the base address of vmalloc area. makedumpfile gets this value +since is necessary for vmalloc translation. mem_map ------- diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index 66cde752cd7409..2a24199a9b81e0 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c index 8706736fd4e2dc..d18d529fd9b984 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/crash_core.c @@ -8,7 +8,6 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_NUMBER(phys_ram_base); vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); #ifdef CONFIG_MMU VMCOREINFO_NUMBER(VA_BITS); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8ddde..91810b4e95107b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ -extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 75cd6a736d0306..b60de490c1fccb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); @@ -789,8 +789,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index b4cac76ea5e989..8a689b4ff4f982 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -89,7 +89,6 @@ static struct test_item test_items[] = { ITEM_DATA(kallsyms_test_var_data_static), ITEM_DATA(kallsyms_test_var_bss), ITEM_DATA(kallsyms_test_var_data), - ITEM_DATA(vmap_area_list), #endif }; diff --git a/mm/nommu.c b/mm/nommu.c index b6dc558d314408..5ec8f44e7ce976 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -LIST_HEAD(vmap_area_list); - void vfree(const void *addr) { kfree(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 86efebf0e0c8a7..b5882790da0088 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -729,8 +729,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); -/* Export for kexec only */ -LIST_HEAD(vmap_area_list); static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; From 5c39e51e42185e7f4a5bf2c65ad5886202fafb30 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 11 Jan 2024 20:23:29 +0100 Subject: [PATCH 425/707] mm: vmalloc: Fix a warning in the crash_save_vmcoreinfo_init() The vmcoreinfo_append_str() function expects "long unsigned int" type as a second argument(0x%lx) to print a beginning of vmalloc start address which is defined as a VMALLOC_START macro. For some architectures it can be considered as "int" type, for example m68 generates a compile warning message. To fix it cast a second argument to "unsigned long". Link: https://lkml.kernel.org/r/20240111192329.449189-1-urezki@gmail.com Fixes: 9bdb180b2db6 ("mm/vmalloc: remove vmap_area_list") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401120218.y469Puyf-lkp@intel.com/ Acked-by: Baoquan He Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b60de490c1fccb..49b31e59d3ccd1 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); From 431c0b3265072b7c8258abb137a154a15d5c1aab Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:28 +0100 Subject: [PATCH 426/707] mm: vmalloc: remove global purge_vmap_area_root rb-tree Similar to busy VA, lazily-freed area is stored to a node it belongs to. Such approach does not require any global locking primitive, instead an access becomes scalable what mitigates a contention. This patch removes a global purge-lock, global purge-tree and global purge list. Link: https://lkml.kernel.org/r/20240102184633.748113-7-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 53 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b5882790da0088..72822aeff55c22 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -731,10 +731,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); static bool vmap_initialized __read_mostly; -static struct rb_root purge_vmap_area_root = RB_ROOT; -static LIST_HEAD(purge_vmap_area_list); -static DEFINE_SPINLOCK(purge_vmap_area_lock); - /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to @@ -782,6 +778,12 @@ struct rb_list { static struct vmap_node { /* Bookkeeping data of this node. */ struct rb_list busy; + struct rb_list lazy; + + /* + * Ready-to-free areas. + */ + struct list_head purge_list; } single; static struct vmap_node *vmap_nodes = &single; @@ -1766,40 +1768,22 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); +static cpumask_t purge_nodes; /* * Purges all lazily-freed vmap areas. */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static unsigned long +purge_vmap_node(struct vmap_node *vn) { - unsigned long resched_threshold; - unsigned int num_purged_areas = 0; - struct list_head local_purge_list; + unsigned long num_purged_areas = 0; struct vmap_area *va, *n_va; - lockdep_assert_held(&vmap_purge_lock); - - spin_lock(&purge_vmap_area_lock); - purge_vmap_area_root = RB_ROOT; - list_replace_init(&purge_vmap_area_list, &local_purge_list); - spin_unlock(&purge_vmap_area_lock); - - if (unlikely(list_empty(&local_purge_list))) - goto out; - - start = min(start, - list_first_entry(&local_purge_list, - struct vmap_area, list)->va_start); - - end = max(end, - list_last_entry(&local_purge_list, - struct vmap_area, list)->va_end); - - flush_tlb_kernel_range(start, end); - resched_threshold = lazy_max_pages() << 1; + if (list_empty(&vn->purge_list)) + return 0; spin_lock(&free_vmap_area_lock); - list_for_each_entry_safe(va, n_va, &local_purge_list, list) { + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1821,13 +1805,55 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) atomic_long_sub(nr, &vmap_lazy_nr); num_purged_areas++; - - if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); -out: + return num_purged_areas; +} + +/* + * Purges all lazily-freed vmap areas. + */ +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +{ + unsigned long num_purged_areas = 0; + struct vmap_node *vn; + int i; + + lockdep_assert_held(&vmap_purge_lock); + purge_nodes = CPU_MASK_NONE; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + INIT_LIST_HEAD(&vn->purge_list); + + if (RB_EMPTY_ROOT(&vn->lazy.root)) + continue; + + spin_lock(&vn->lazy.lock); + WRITE_ONCE(vn->lazy.root.rb_node, NULL); + list_replace_init(&vn->lazy.head, &vn->purge_list); + spin_unlock(&vn->lazy.lock); + + start = min(start, list_first_entry(&vn->purge_list, + struct vmap_area, list)->va_start); + + end = max(end, list_last_entry(&vn->purge_list, + struct vmap_area, list)->va_end); + + cpumask_set_cpu(i, &purge_nodes); + } + + if (cpumask_weight(&purge_nodes) > 0) { + flush_tlb_kernel_range(start, end); + + for_each_cpu(i, &purge_nodes) { + vn = &nodes[i]; + num_purged_areas += purge_vmap_node(vn); + } + } + trace_purge_vmap_area_lazy(start, end, num_purged_areas); return num_purged_areas > 0; } @@ -1846,16 +1872,9 @@ static void reclaim_and_purge_vmap_areas(void) static void drain_vmap_area_work(struct work_struct *work) { - unsigned long nr_lazy; - - do { - mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - - /* Recheck if further work is required. */ - nr_lazy = atomic_long_read(&vmap_lazy_nr); - } while (nr_lazy > lazy_max_pages()); + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -1865,6 +1884,7 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned long nr_lazy; @@ -1878,10 +1898,9 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* * Merge or place it to the purge tree/list. */ - spin_lock(&purge_vmap_area_lock); - merge_or_add_vmap_area(va, - &purge_vmap_area_root, &purge_vmap_area_list); - spin_unlock(&purge_vmap_area_lock); + spin_lock(&vn->lazy.lock); + merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -4411,15 +4430,21 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { + struct vmap_node *vn; struct vmap_area *va; + int i; - spin_lock(&purge_vmap_area_lock); - list_for_each_entry(va, &purge_vmap_area_list, list) { - seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->lazy.lock); + list_for_each_entry(va, &vn->lazy.head, list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } + spin_unlock(&vn->lazy.lock); } - spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) @@ -4558,6 +4583,10 @@ static void vmap_init_nodes(void) vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); + + vn->lazy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->lazy.head); + spin_lock_init(&vn->lazy.lock); } } From 7db166b4aa0d930d041ae9ff1ad1f706c608449e Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:29 +0100 Subject: [PATCH 427/707] mm: vmalloc: offload free_vmap_area_lock lock Concurrent access to a global vmap space is a bottle-neck. We can simulate a high contention by running a vmalloc test suite. To address it, introduce an effective vmap node logic. Each node behaves as independent entity. When a node is accessed it serves a request directly(if possible) from its pool. This model has a size based pool for requests, i.e. pools are serialized and populated based on object size and real demand. A maximum object size that pool can handle is set to 256 pages. This technique reduces a pressure on the global vmap lock. Link: https://lkml.kernel.org/r/20240102184633.748113-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 387 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 342 insertions(+), 45 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 72822aeff55c22..e8b9621ea02b46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -775,7 +775,22 @@ struct rb_list { spinlock_t lock; }; +struct vmap_pool { + struct list_head head; + unsigned long len; +}; + +/* + * A fast size storage contains VAs up to 1M size. + */ +#define MAX_VA_SIZE_PAGES 256 + static struct vmap_node { + /* Simple size segregated storage. */ + struct vmap_pool pool[MAX_VA_SIZE_PAGES]; + spinlock_t pool_lock; + bool skip_populate; + /* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy; @@ -784,6 +799,8 @@ static struct vmap_node { * Ready-to-free areas. */ struct list_head purge_list; + struct work_struct purge_work; + unsigned long nr_purged; } single; static struct vmap_node *vmap_nodes = &single; @@ -802,6 +819,61 @@ addr_to_node(unsigned long addr) return &vmap_nodes[addr_to_node_id(addr)]; } +static inline struct vmap_node * +id_to_node(unsigned int id) +{ + return &vmap_nodes[id % nr_vmap_nodes]; +} + +/* + * We use the value 0 to represent "no node", that is why + * an encoded value will be the node-id incremented by 1. + * It is always greater then 0. A valid node_id which can + * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id + * is not valid 0 is returned. + */ +static unsigned int +encode_vn_id(unsigned int node_id) +{ + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return (node_id + 1) << BITS_PER_BYTE; + + /* Warn and no node encoded. */ + WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); + return 0; +} + +/* + * Returns an encoded node-id, the valid range is within + * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is + * returned if extracted data is wrong. + */ +static unsigned int +decode_vn_id(unsigned int val) +{ + unsigned int node_id = (val >> BITS_PER_BYTE) - 1; + + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return node_id; + + /* If it was _not_ zero, warn. */ + WARN_ONCE(node_id != UINT_MAX, + "Decode wrong node id (%d)\n", node_id); + + return nr_vmap_nodes; +} + +static bool +is_vn_id_valid(unsigned int node_id) +{ + if (node_id < nr_vmap_nodes) + return true; + + return false; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -1623,6 +1695,104 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) kmem_cache_free(vmap_area_cachep, va); } +static struct vmap_pool * +size_to_va_pool(struct vmap_node *vn, unsigned long size) +{ + unsigned int idx = (size - 1) / PAGE_SIZE; + + if (idx < MAX_VA_SIZE_PAGES) + return &vn->pool[idx]; + + return NULL; +} + +static bool +node_pool_add_va(struct vmap_node *n, struct vmap_area *va) +{ + struct vmap_pool *vp; + + vp = size_to_va_pool(n, va_size(va)); + if (!vp) + return false; + + spin_lock(&n->pool_lock); + list_add(&va->list, &vp->head); + WRITE_ONCE(vp->len, vp->len + 1); + spin_unlock(&n->pool_lock); + + return true; +} + +static struct vmap_area * +node_pool_del_va(struct vmap_node *vn, unsigned long size, + unsigned long align, unsigned long vstart, + unsigned long vend) +{ + struct vmap_area *va = NULL; + struct vmap_pool *vp; + int err = 0; + + vp = size_to_va_pool(vn, size); + if (!vp || list_empty(&vp->head)) + return NULL; + + spin_lock(&vn->pool_lock); + if (!list_empty(&vp->head)) { + va = list_first_entry(&vp->head, struct vmap_area, list); + + if (IS_ALIGNED(va->va_start, align)) { + /* + * Do some sanity check and emit a warning + * if one of below checks detects an error. + */ + err |= (va_size(va) != size); + err |= (va->va_start < vstart); + err |= (va->va_end > vend); + + if (!WARN_ON_ONCE(err)) { + list_del_init(&va->list); + WRITE_ONCE(vp->len, vp->len - 1); + } else { + va = NULL; + } + } else { + list_move_tail(&va->list, &vp->head); + va = NULL; + } + } + spin_unlock(&vn->pool_lock); + + return va; +} + +static struct vmap_area * +node_alloc(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, + unsigned long *addr, unsigned int *vn_id) +{ + struct vmap_area *va; + + *vn_id = 0; + *addr = vend; + + /* + * Fallback to a global heap if not vmalloc or there + * is only one node. + */ + if (vstart != VMALLOC_START || vend != VMALLOC_END || + nr_vmap_nodes == 1) + return NULL; + + *vn_id = raw_smp_processor_id() % nr_vmap_nodes; + va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); + *vn_id = encode_vn_id(*vn_id); + + if (va) + *addr = va->va_start; + + return va; +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1637,6 +1807,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *va; unsigned long freed; unsigned long addr; + unsigned int vn_id; int purged = 0; int ret; @@ -1647,11 +1818,23 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - if (unlikely(!va)) - return ERR_PTR(-ENOMEM); + /* + * If a VA is obtained from a global heap(if it fails here) + * it is anyway marked with this "vn_id" so it is returned + * to this pool's node later. Such way gives a possibility + * to populate pools based on users demand. + * + * On success a ready to go VA is returned. + */ + va = node_alloc(size, align, vstart, vend, &addr, &vn_id); + if (!va) { + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; + + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + } /* * Only scan the relevant parts containing pointers to other objects @@ -1660,10 +1843,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, - size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); + if (addr == vend) { + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); @@ -1677,7 +1862,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - va->flags = va_flags; + va->flags = (va_flags | vn_id); vn = addr_to_node(va->va_start); @@ -1770,63 +1955,135 @@ static DEFINE_MUTEX(vmap_purge_lock); static void purge_fragmented_blocks_allcpus(void); static cpumask_t purge_nodes; -/* - * Purges all lazily-freed vmap areas. - */ -static unsigned long -purge_vmap_node(struct vmap_node *vn) +static void +reclaim_list_global(struct list_head *head) { - unsigned long num_purged_areas = 0; - struct vmap_area *va, *n_va; + struct vmap_area *va, *n; - if (list_empty(&vn->purge_list)) - return 0; + if (list_empty(head)) + return; spin_lock(&free_vmap_area_lock); + list_for_each_entry_safe(va, n, head, list) + merge_or_add_vmap_area_augment(va, + &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); +} + +static void +decay_va_pool_node(struct vmap_node *vn, bool full_decay) +{ + struct vmap_area *va, *nva; + struct list_head decay_list; + struct rb_root decay_root; + unsigned long n_decay; + int i; + + decay_root = RB_ROOT; + INIT_LIST_HEAD(&decay_list); + + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + struct list_head tmp_list; + + if (list_empty(&vn->pool[i].head)) + continue; + + INIT_LIST_HEAD(&tmp_list); + + /* Detach the pool, so no-one can access it. */ + spin_lock(&vn->pool_lock); + list_replace_init(&vn->pool[i].head, &tmp_list); + spin_unlock(&vn->pool_lock); + + if (full_decay) + WRITE_ONCE(vn->pool[i].len, 0); + + /* Decay a pool by ~25% out of left objects. */ + n_decay = vn->pool[i].len >> 2; + + list_for_each_entry_safe(va, nva, &tmp_list, list) { + list_del_init(&va->list); + merge_or_add_vmap_area(va, &decay_root, &decay_list); + + if (!full_decay) { + WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); + + if (!--n_decay) + break; + } + } + + /* Attach the pool back if it has been partly decayed. */ + if (!full_decay && !list_empty(&tmp_list)) { + spin_lock(&vn->pool_lock); + list_replace_init(&tmp_list, &vn->pool[i].head); + spin_unlock(&vn->pool_lock); + } + } + + reclaim_list_global(&decay_list); +} + +static void purge_vmap_node(struct work_struct *work) +{ + struct vmap_node *vn = container_of(work, + struct vmap_node, purge_work); + struct vmap_area *va, *n_va; + LIST_HEAD(local_list); + + vn->nr_purged = 0; + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; + unsigned int vn_id = decode_vn_id(va->flags); - /* - * Finally insert or merge lazily-freed area. It is - * detached and there is no need to "unlink" it from - * anything. - */ - va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, - &free_vmap_area_list); - - if (!va) - continue; + list_del_init(&va->list); if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); - num_purged_areas++; + vn->nr_purged++; + + if (is_vn_id_valid(vn_id) && !vn->skip_populate) + if (node_pool_add_va(vn, va)) + continue; + + /* Go back to global. */ + list_add(&va->list, &local_list); } - spin_unlock(&free_vmap_area_lock); - return num_purged_areas; + reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, + bool full_pool_decay) { - unsigned long num_purged_areas = 0; + unsigned long nr_purged_areas = 0; + unsigned int nr_purge_helpers; + unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); + + /* + * Use cpumask to mark which node has to be processed. + */ purge_nodes = CPU_MASK_NONE; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; INIT_LIST_HEAD(&vn->purge_list); + vn->skip_populate = full_pool_decay; + decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; @@ -1845,17 +2102,45 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) cpumask_set_cpu(i, &purge_nodes); } - if (cpumask_weight(&purge_nodes) > 0) { + nr_purge_nodes = cpumask_weight(&purge_nodes); + if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); + /* One extra worker is per a lazy_max_pages() full set minus one. */ + nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); + nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; + for_each_cpu(i, &purge_nodes) { - vn = &nodes[i]; - num_purged_areas += purge_vmap_node(vn); + vn = &vmap_nodes[i]; + + if (nr_purge_helpers > 0) { + INIT_WORK(&vn->purge_work, purge_vmap_node); + + if (cpumask_test_cpu(i, cpu_online_mask)) + schedule_work_on(i, &vn->purge_work); + else + schedule_work(&vn->purge_work); + + nr_purge_helpers--; + } else { + vn->purge_work.func = NULL; + purge_vmap_node(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } + } + + for_each_cpu(i, &purge_nodes) { + vn = &vmap_nodes[i]; + + if (vn->purge_work.func) { + flush_work(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } } } - trace_purge_vmap_area_lazy(start, end, num_purged_areas); - return num_purged_areas > 0; + trace_purge_vmap_area_lazy(start, end, nr_purged_areas); + return nr_purged_areas > 0; } /* @@ -1866,14 +2151,14 @@ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } @@ -1884,9 +2169,10 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { - struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; + unsigned int vn_id = decode_vn_id(va->flags); + struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) @@ -1896,10 +2182,14 @@ static void free_vmap_area_noflush(struct vmap_area *va) PAGE_SHIFT, &vmap_lazy_nr); /* - * Merge or place it to the purge tree/list. + * If it was request by a certain node we would like to + * return it to that node, i.e. its pool for later reuse. */ + vn = is_vn_id_valid(vn_id) ? + id_to_node(vn_id):addr_to_node(va->va_start); + spin_lock(&vn->lazy.lock); - merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -2408,7 +2698,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) } free_purged_blocks(&purge_list); - if (!__purge_vmap_area_lazy(start, end) && flush) + if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } @@ -4576,7 +4866,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i; + int i, j; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; @@ -4587,6 +4877,13 @@ static void vmap_init_nodes(void) vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { + INIT_LIST_HEAD(&vn->pool[j].head); + WRITE_ONCE(vn->pool[j].len, 0); + } + + spin_lock_init(&vn->pool_lock); } } From d77b8e651af5694bfbcc8c9a10b24f1bbb1410ed Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:30 +0100 Subject: [PATCH 428/707] mm: vmalloc: support multiple nodes in vread_iter Extend the vread_iter() to be able to perform a sequential reading of VAs which are spread among multiple nodes. So a data read over the /dev/kmem correctly reflects a vmalloc memory layout. Link: https://lkml.kernel.org/r/20240102184633.748113-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8b9621ea02b46..8b0cad0e2aefa7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -906,7 +906,7 @@ unsigned long vmalloc_nr_pages(void) /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * -find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) +__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; struct rb_node *n = root->rb_node; @@ -930,6 +930,41 @@ find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) return va; } +/* + * Returns a node where a first VA, that satisfies addr < va_end, resides. + * If success, a node is locked. A user is responsible to unlock it when a + * VA is no longer needed to be accessed. + * + * Returns NULL if nothing found. + */ +static struct vmap_node * +find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) +{ + struct vmap_node *vn, *va_node = NULL; + struct vmap_area *va_lowest; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->busy.lock); + va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + if (va_lowest) { + if (!va_node || va_lowest->va_start < (*va)->va_start) { + if (va_node) + spin_unlock(&va_node->busy.lock); + + *va = va_lowest; + va_node = vn; + continue; + } + } + spin_unlock(&vn->busy.lock); + } + + return va_node; +} + static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -4102,6 +4137,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; + unsigned long next; addr = kasan_reset_tag(addr); @@ -4111,19 +4147,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - /* Hooked to node_0 so far. */ - vn = addr_to_node(0); - spin_lock(&vn->busy.lock); - - va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); - if (!va) + vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); + if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vn->busy.head, list) { + do { size_t copied; if (remains == 0) @@ -4138,10 +4170,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) - continue; + goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) - continue; + goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); @@ -4150,7 +4182,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) - continue; + goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); @@ -4179,15 +4211,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (copied != n) goto finished; - } + + next_va: + next = va->va_end; + spin_unlock(&vn->busy.lock); + } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); + /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); return count - remains; } From 478163dd5cac116b724b5ceb7c18c19c843a77ec Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:31 +0100 Subject: [PATCH 429/707] mm: vmalloc: support multiple nodes in vmallocinfo Allocated areas are spread among nodes, it implies that the scanning has to be performed individually of each node in order to dump all existing VAs. Link: https://lkml.kernel.org/r/20240102184633.748113-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 120 ++++++++++++++++++++------------------------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8b0cad0e2aefa7..f0aaf926e3ccd2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4709,30 +4709,6 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - - mutex_lock(&vmap_purge_lock); - spin_lock(&vn->busy.lock); - - return seq_list_start(&vn->busy.head, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - return seq_list_next(p, &vn->busy.head, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - struct vmap_node *vn = addr_to_node(0); - - spin_unlock(&vn->busy.lock); - mutex_unlock(&vmap_purge_lock); -} - static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { @@ -4776,84 +4752,82 @@ static void show_purge_info(struct seq_file *m) } } -static int s_show(struct seq_file *m, void *p) +static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + int i; - vn = addr_to_node(0); - va = list_entry(p, struct vmap_area, list); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; - if (!va->vm) { - if (va->flags & VMAP_RAM) - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + spin_lock(&vn->busy.lock); + list_for_each_entry(va, &vn->busy.head, list) { + if (!va->vm) { + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); - goto final; - } + continue; + } - v = va->vm; + v = va->vm; - seq_printf(m, "0x%pK-0x%pK %7ld", - v->addr, v->addr + v->size, v->size); + seq_printf(m, "0x%pK-0x%pK %7ld", + v->addr, v->addr + v->size, v->size); - if (v->caller) - seq_printf(m, " %pS", v->caller); + if (v->caller) + seq_printf(m, " %pS", v->caller); - if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); - if (v->phys_addr) - seq_printf(m, " phys=%pa", &v->phys_addr); + if (v->phys_addr) + seq_printf(m, " phys=%pa", &v->phys_addr); - if (v->flags & VM_IOREMAP) - seq_puts(m, " ioremap"); + if (v->flags & VM_IOREMAP) + seq_puts(m, " ioremap"); - if (v->flags & VM_ALLOC) - seq_puts(m, " vmalloc"); + if (v->flags & VM_ALLOC) + seq_puts(m, " vmalloc"); - if (v->flags & VM_MAP) - seq_puts(m, " vmap"); + if (v->flags & VM_MAP) + seq_puts(m, " vmap"); - if (v->flags & VM_USERMAP) - seq_puts(m, " user"); + if (v->flags & VM_USERMAP) + seq_puts(m, " user"); - if (v->flags & VM_DMA_COHERENT) - seq_puts(m, " dma-coherent"); + if (v->flags & VM_DMA_COHERENT) + seq_puts(m, " dma-coherent"); - if (is_vmalloc_addr(v->pages)) - seq_puts(m, " vpages"); + if (is_vmalloc_addr(v->pages)) + seq_puts(m, " vpages"); - show_numa_info(m, v); - seq_putc(m, '\n'); + show_numa_info(m, v); + seq_putc(m, '\n'); + } + spin_unlock(&vn->busy.lock); + } /* * As a final step, dump "unpurged" areas. */ -final: - if (list_is_last(&va->list, &vn->busy.head)) - show_purge_info(m); - + show_purge_info(m); return 0; } -static const struct seq_operations vmalloc_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - static int __init proc_vmalloc_init(void) { + void *priv_data = NULL; + if (IS_ENABLED(CONFIG_NUMA)) - proc_create_seq_private("vmallocinfo", 0400, NULL, - &vmalloc_op, - nr_node_ids * sizeof(unsigned int), NULL); - else - proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); + priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + + proc_create_single_data("vmallocinfo", + 0400, NULL, vmalloc_info_show, priv_data); + return 0; } module_init(proc_vmalloc_init); From a3822923af51de08df5dce8c32ce7c194b7016f4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:32 +0100 Subject: [PATCH 430/707] mm: vmalloc: set nr_nodes based on CPUs in a system A number of nodes which are used in the alloc/free paths is set based on num_possible_cpus() in a system. Please note a high limit threshold though is fixed and corresponds to 128 nodes. For 32-bit or single core systems an access to a global vmap heap is not balanced. Such small systems do not suffer from lock contentions due to low number of CPUs. In such case the nr_nodes is equal to 1. Test on AMD Ryzen Threadripper 3970X 32-Core Processor: sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that a native_queued_spin_lock_slowpath goes down to 16.51% percent from 93.07%. The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f0aaf926e3ccd2..af986953d2d7ed 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4879,10 +4879,27 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, j; + int i, n; + +#if BITS_PER_LONG == 64 + /* A high threshold of max nodes is fixed and bound to 128. */ + n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + + if (n > 1) { + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + if (vn) { + /* Node partition is 16 pages. */ + vmap_zone_size = (1 << 4) * PAGE_SIZE; + nr_vmap_nodes = n; + vmap_nodes = vn; + } else { + pr_err("Failed to allocate an array. Disable a node layer\n"); + } + } +#endif - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for (n = 0; n < nr_vmap_nodes; n++) { + vn = &vmap_nodes[n]; vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -4891,9 +4908,9 @@ static void vmap_init_nodes(void) INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { - INIT_LIST_HEAD(&vn->pool[j].head); - WRITE_ONCE(vn->pool[j].len, 0); + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + INIT_LIST_HEAD(&vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); From 84f8958cd667d1c192bd69a0504e59e21dc75680 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:33 +0100 Subject: [PATCH 431/707] mm: vmalloc: add a shrinker to drain vmap pools The added shrinker is used to return back current cached VAs into a global vmap space, when a system enters into a low memory mode. Link: https://lkml.kernel.org/r/20240102184633.748113-12-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Kazuhito Hagio Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af986953d2d7ed..257981e37936cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4917,8 +4917,37 @@ static void vmap_init_nodes(void) } } +static unsigned long +vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct vmap_node *vn; + int i, j; + + for (count = 0, i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) + count += READ_ONCE(vn->pool[j].len); + } + + return count ? count : SHRINK_EMPTY; +} + +static unsigned long +vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int i; + + for (i = 0; i < nr_vmap_nodes; i++) + decay_va_pool_node(&vmap_nodes[i], true); + + return SHRINK_STOP; +} + void __init vmalloc_init(void) { + struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; @@ -4966,4 +4995,14 @@ void __init vmalloc_init(void) */ vmap_init_free_space(); vmap_initialized = true; + + vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); + if (!vmap_node_shrinker) { + pr_err("Failed to allocate vmap-node shrinker!\n"); + return; + } + + vmap_node_shrinker->count_objects = vmap_node_shrink_count; + vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; + shrinker_register(vmap_node_shrinker); } From 525847c5902cfc4a5620018351cf00fc2fa6e157 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:19 +0100 Subject: [PATCH 432/707] mm: vmalloc: improve description of vmap node layer This patch adds extra explanation of recently added vmap node layer based on community feedback. No functional change. Link: https://lkml.kernel.org/r/20240124180920.50725-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 60 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 257981e37936cd..b8be601b056d8f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -765,9 +765,10 @@ static struct rb_root free_vmap_area_root = RB_ROOT; static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); /* - * An effective vmap-node logic. Users make use of nodes instead - * of a global heap. It allows to balance an access and mitigate - * contention. + * This structure defines a single, solid model where a list and + * rb-tree are part of one entity protected by the lock. Nodes are + * sorted in ascending order, thus for O(1) access to left/right + * neighbors a list is used as well as for sequential traversal. */ struct rb_list { struct rb_root root; @@ -775,16 +776,23 @@ struct rb_list { spinlock_t lock; }; +/* + * A fast size storage contains VAs up to 1M size. A pool consists + * of linked between each other ready to go VAs of certain sizes. + * An index in the pool-array corresponds to number of pages + 1. + */ +#define MAX_VA_SIZE_PAGES 256 + struct vmap_pool { struct list_head head; unsigned long len; }; /* - * A fast size storage contains VAs up to 1M size. + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. */ -#define MAX_VA_SIZE_PAGES 256 - static struct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES]; @@ -803,6 +811,11 @@ static struct vmap_node { unsigned long nr_purged; } single; +/* + * Initial setup consists of one single node, i.e. a balancing + * is fully disabled. Later on, after vmap is initialized these + * parameters are updated based on a system capacity. + */ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; @@ -2048,7 +2061,12 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) } } - /* Attach the pool back if it has been partly decayed. */ + /* + * Attach the pool back if it has been partly decayed. + * Please note, it is supposed that nobody(other contexts) + * can populate the pool therefore a simple list replace + * operation takes place here. + */ if (!full_decay && !list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); @@ -2257,16 +2275,14 @@ struct vmap_area *find_vmap_area(unsigned long addr) * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we - * may need to scan an extra nodes. See an example: + * may need to scan extra nodes. See an example: * - * <--va--> + * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * - * VA resides in node 1 whereas it spans 1 and 2. If passed - * addr is within a second node we should do extra work. We - * should mention that it is rare and is a corner case from - * the other hand it has to be covered. + * VA resides in node 1 whereas it spans 1, 2 an 0. If passed + * addr is within 2 or 0 nodes we should do extra work. */ i = j = addr_to_node_id(addr); do { @@ -2289,6 +2305,9 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + /* + * Check the comment in the find_vmap_area() about the loop. + */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; @@ -4882,7 +4901,20 @@ static void vmap_init_nodes(void) int i, n; #if BITS_PER_LONG == 64 - /* A high threshold of max nodes is fixed and bound to 128. */ + /* + * A high threshold of max nodes is fixed and bound to 128, + * thus a scale factor is 1 for systems where number of cores + * are less or equal to specified threshold. + * + * As for NUMA-aware notes. For bigger systems, for example + * NUMA with multi-sockets, where we can end-up with thousands + * of cores in total, a "sub-numa-clustering" should be added. + * + * In this case a NUMA domain is considered as a single entity + * with dedicated sub-nodes in it which describe one group or + * set of cores. Therefore a per-domain purging is supposed to + * be added as well as a per-domain balancing. + */ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { From ed82e5bbef9c54b438055c9ebfe52d669f1f3fdb Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:20 +0100 Subject: [PATCH 433/707] mm: vmalloc: refactor vmalloc_dump_obj() function This patch tends to simplify the function in question, by removing an extra stack "objp" variable, returning back to an early exit approach if spin_trylock() fails or VA was not found. Link: https://lkml.kernel.org/r/20240124180920.50725-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b8be601b056d8f..449f45b0e47497 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4696,34 +4696,35 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { - void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; + struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; - bool success = false; - - vn = addr_to_node((unsigned long)objp); - if (spin_trylock(&vn->busy.lock)) { - va = __find_vmap_area((unsigned long)objp, &vn->busy.root); + addr = PAGE_ALIGN((unsigned long) object); + vn = addr_to_node(addr); - if (va && va->vm) { - addr = (unsigned long)va->vm->addr; - caller = va->vm->caller; - nr_pages = va->vm->nr_pages; - success = true; - } + if (!spin_trylock(&vn->busy.lock)) + return false; + va = __find_vmap_area(addr, &vn->busy.root); + if (!va || !va->vm) { spin_unlock(&vn->busy.lock); + return false; } - if (success) - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); + vm = va->vm; + addr = (unsigned long) vm->addr; + caller = vm->caller; + nr_pages = vm->nr_pages; + spin_unlock(&vn->busy.lock); - return success; + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return true; } #endif From 7edf0a77cf93b772ff8004f42eeb9d6403ecc3e8 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 10 Jan 2024 16:46:22 +0800 Subject: [PATCH 434/707] mm/mmap: simplify vma link and unlink The file parameter in the __remove_shared_vm_struct is no longer used, remove it. These functions vma_link() and mmap_region() have some of the same code, introduce vma_link_file() helper function to simplify the code. Link: https://lkml.kernel.org/r/20240110084622.2425927-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b61..282ed6d0914b07 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) + struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); @@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, file, mapping); + __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } @@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +static void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); - struct address_space *mapping = NULL; vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); - vma_iter_store(&vmi, vma); - - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } - + vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; @@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp, } if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->file, - vp->mapping); + __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs @@ -2891,16 +2894,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; - if (vma->vm_file) { - i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } + vma_link_file(vma); /* * vma_merge() calls khugepaged_enter_vma() either, the below From 67dfd41fbce86e1a59593c9e122cf4b094a77f18 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 29 Dec 2023 16:22:07 +0800 Subject: [PATCH 435/707] mm: memory: use nth_page() in clear/copy_subpage() The clear and copy of huge gigantic page has converted to use nth_page() to handle the possible discontinuous struct page(SPARSEMEM without VMEMMAP), but not change for the non-gigantic part, fix it too. Link: https://lkml.kernel.org/r/20231229082207.60235-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 89bcae0b224d6d..762458faec6969 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6143,7 +6143,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg) { struct page *page = arg; - clear_user_highpage(page + idx, addr); + clear_user_highpage(nth_page(page, idx), addr); return 0; } @@ -6193,10 +6193,11 @@ struct copy_subpage_arg { static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; + struct page *dst = nth_page(copy_arg->dst, idx); + struct page *src = nth_page(copy_arg->src, idx); - if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, - addr, copy_arg->vma)) { - memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0); + if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; } return 0; From 7269896acbba7b62463b4b14a2fb5f6c98fe07fd Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:14 +0000 Subject: [PATCH 436/707] mm: list_lru: disable memcg_aware when cgroup.memory is set to "nokmem" Actually, when using a boot time kernel option "cgroup.memory=nokmem", all lru items are inserted to list_lru_node. But for those users who invoke list_lru_init_memcg() to initialize list_lru, list_lru_memcg_aware() returns true. And this brings unneeded operations related to memcg. To make things more convenient, let's disable memcg_aware when cgroup.memory is set to "nokmem". Link: https://lkml.kernel.org/r/20231228062715.338672-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/list_lru.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/list_lru.c b/mm/list_lru.c index 35b0147542a9de..158781d1d3c215 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -567,6 +567,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; + + if (mem_cgroup_kmem_disabled()) + memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); From bc72e937a1de5ab1189c42def85494530da20f8e Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:15 +0000 Subject: [PATCH 437/707] mm: list_lru: remove unused macro list_lru_init_key() list_lru_init_key() isn't used by anyone, remove it to clean up. Link: https://lkml.kernel.org/r/20231228062715.338672-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a070108..c679e6b293c4c4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -62,8 +62,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) -#define list_lru_init_key(lru, key) \ - __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) From f84490c9c892898560d7f58fd8765bdfecb0ad07 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 20 Dec 2023 22:59:42 -0800 Subject: [PATCH 438/707] mm: mmap: no need to call khugepaged_enter_vma() for stack We avoid allocating THP for temporary stack, even though khugepaged_enter_vma() is called for stack VMAs, it actualy returns false. So no need to call it in the first place at all. Link: https://lkml.kernel.org/r/20231221065943.2803551-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Yin Fengwei Cc: Christopher Lameter Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 282ed6d0914b07..66f534ec90a55e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2051,7 +2051,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; @@ -2145,7 +2144,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; From 096a4624ec8e550103329db6f157c9bae9c0c9c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:16 +0000 Subject: [PATCH 439/707] memcg: convert mem_cgroup_move_charge_pte_range() to use a folio Patch series "Convert memcontrol charge moving to use folios". No part of these patches should change behaviour; all the called functions already convert from page to folio, so this ought to simply be a reduction in the number of calls to compound_head(). This patch (of 4): Remove many calls to compound_head() by calling page_folio() once at the start of each stanza which receives a struct page from 'target'. There should be no change in behaviour here as all the called functions start out by converting the page to its folio. Link: https://lkml.kernel.org/r/20240111181219.3462852-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111181219.3462852-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9ca0fdbe4ab04..516f811f740a20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5966,23 +5966,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, } /** - * mem_cgroup_move_account - move account of the page - * @page: the page + * mem_cgroup_move_account - move account of the folio + * @folio: The folio. * @compound: charge the page as compound or small page - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. + * @from: mem_cgroup which the folio is moved from. + * @to: mem_cgroup which the folio is moved to. @from != @to. * - * The page must be locked and not on the LRU. + * The folio must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ -static int mem_cgroup_move_account(struct page *page, +static int mem_cgroup_move_account(struct folio *folio, bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { - struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; @@ -6432,7 +6431,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; enum mc_target_type target_type; union mc_target target; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -6442,26 +6441,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - page = target.page; - if (isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (folio_isolate_lru(folio)) { + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - putback_lru_page(page); + folio_putback_lru(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - page = target.page; - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } spin_unlock(ptl); return 0; @@ -6484,28 +6483,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, device = true; fallthrough; case MC_TARGET_PAGE: - page = target.page; + folio = page_folio(target.page); /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply * ignore such a partial THP and keep it in original * memcg. There should be somebody mapping the head. */ - if (PageTransCompound(page)) + if (folio_test_large(folio)) goto put; - if (!device && !isolate_lru_page(page)) + if (!device && !folio_isolate_lru(folio)) goto put; - if (!mem_cgroup_move_account(page, false, + if (!mem_cgroup_move_account(folio, false, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; } if (!device) - putback_lru_page(page); + folio_putback_lru(folio); put: /* get_mctgt_type() gets & locks the page */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; case MC_TARGET_SWAP: ent = target.ent; From 0635b2765fbfd9789bd08389b189197c6b113baa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:17 +0000 Subject: [PATCH 440/707] memcg: return the folio in union mc_target All users of target.page convert it to the folio, so we can just return the folio directly and save a few calls to compound_head(). Link: https://lkml.kernel.org/r/20240111181219.3462852-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 516f811f740a20..d06066444244d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5874,7 +5874,7 @@ static int mem_cgroup_do_precharge(unsigned long count) } union mc_target { - struct page *page; + struct folio *folio; swp_entry_t ent; }; @@ -6096,7 +6096,7 @@ static int mem_cgroup_move_account(struct folio *folio, * Return: * * MC_TARGET_NONE - If the pte is not a target for move charge. * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for - * move charge. If @target is not NULL, the page is stored in target->page + * move charge. If @target is not NULL, the folio is stored in target->folio * with extra refcnt taken (Caller should release it). * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a * target for charge migration. If @target is not NULL, the entry is @@ -6161,7 +6161,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) - target->page = page; + target->folio = page_folio(page); } if (!ret || !target) { if (target) @@ -6211,7 +6211,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, put_page(page); return MC_TARGET_NONE; } - target->page = page; + target->folio = page_folio(page); } } return ret; @@ -6441,7 +6441,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - folio = page_folio(target.page); + folio = target.folio; if (folio_isolate_lru(folio)) { if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { @@ -6453,7 +6453,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, folio_unlock(folio); folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - folio = page_folio(target.page); + folio = target.folio; if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6483,7 +6483,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, device = true; fallthrough; case MC_TARGET_PAGE: - folio = page_folio(target.page); + folio = target.folio; /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply From 7122ca3287bf6b90f97ba7b3a0950af5f82d1be3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:18 +0000 Subject: [PATCH 441/707] memcg: use a folio in get_mctgt_type Replace seven calls to compound_head() with one. We still use the page as page_mapped() is different from folio_mapped(). Link: https://lkml.kernel.org/r/20240111181219.3462852-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d06066444244d3..66aceb4ae2bac6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6110,6 +6110,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -6124,9 +6125,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (page) + folio = page_folio(page); if (target && page) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return ret; } /* @@ -6141,8 +6144,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * Alas, skip moving the page in this case. */ if (!pte_present(ptent) && page_mapped(page)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } } @@ -6155,18 +6158,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_coherent_page(page)) + if (folio_is_device_private(folio) || + folio_is_device_coherent(folio)) ret = MC_TARGET_DEVICE; if (target) - target->folio = page_folio(page); + target->folio = folio; } if (!ret || !target) { if (target) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } /* From 66f4e1e3f973f30af7ee40d05c77c87af2d20587 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:19 +0000 Subject: [PATCH 442/707] memcg: use a folio in get_mctgt_type_thp Replace five calls to compound_head() with one. Link: https://lkml.kernel.org/r/20240111181219.3462852-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66aceb4ae2bac6..6cc6b3a2a60c97 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6195,6 +6195,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; if (unlikely(is_swap_pmd(pmd))) { @@ -6204,17 +6205,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); + folio = page_folio(page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; if (target) { - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); return MC_TARGET_NONE; } - target->folio = page_folio(page); + target->folio = folio; } } return ret; From 26f2f9d6d594d1b0c222b1d8792fe7c76b44ecad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:20 +0000 Subject: [PATCH 443/707] mm: add pfn_swap_entry_folio() Patch series "mm: convert mm counter to take a folio", v3. Make sure all mm_counter() and mm_counter_file() callers have a folio, then convert mm counter functions to take a folio, which saves some compound_head() calls. This patch (of 10): Thanks to the compound_head() hidden inside PageLocked(), this saves a call to compound_head() over calling page_folio(pfn_swap_entry_to_page()) Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/swapops.h | 13 +++++++++++++ mm/filemap.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0e0..48b700ba1d188a 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) return p; } +static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) +{ + struct folio *folio = pfn_folio(swp_offset_pfn(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked + */ + BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); + + return folio; +} + /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They are used to represent unaddressable device memory diff --git a/mm/filemap.c b/mm/filemap.c index 0d7e20edf46f59..142864338ca4f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 94c958f7ebb50d..5468b2f97cbf70 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2045,7 +2045,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); From fcb48a5ab5300b67337c7f40c47947b436c167ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:21 +0000 Subject: [PATCH 444/707] proc: use pfn_swap_entry_folio where obvious These callers only pass the result to PageAnon(), so we can save the extra call to compound_head() by using pfn_swap_entry_folio(). Link: https://lkml.kernel.org/r/20240111152429.3374566-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3f78ebbb795fe2..ac6ea2cc2ee8fe 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1807,7 +1807,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) @@ -1873,7 +1873,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } From aebc55ef159e549fe3fa3c926c9a05cfd0783b28 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:22 +0000 Subject: [PATCH 445/707] mprotect: use pfn_swap_entry_folio We only want to know whether the folio is anonymous, so use pfn_swap_entry_folio() and save a call to compound_head(). Link: https://lkml.kernel.org/r/20240111152429.3374566-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/mprotect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 81991102f7859e..f8a4544b4601db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t newpte; if (is_writable_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); /* * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else From e088a5658e9131b6f5931b4e7c41308e93840671 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:23 +0000 Subject: [PATCH 446/707] s390: use pfn_swap_entry_folio() in ptep_zap_swap_entry() Call pfn_swap_entry_folio() in ptep_zap_swap_entry() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-5-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 99422926efe1b5..7e5dd4b176642c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -721,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); } free_swap_and_cache(entry); } From 92473e04c1a9278e9cf74cdb079a892d52e39aa0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:24 +0000 Subject: [PATCH 447/707] mm: use pfn_swap_entry_folio() in __split_huge_pmd_locked() Call pfn_swap_entry_folio() in __split_huge_pmd_locked() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-6-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5468b2f97cbf70..33b720037ab725 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2442,7 +2442,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); } else { page = pmd_page(old_pmd); folio = page_folio(page); @@ -2453,7 +2453,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); return; } From 18f2fbe5cffb41513aaea760ad95f3c4f0b14ce7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:25 +0000 Subject: [PATCH 448/707] mm: use pfn_swap_entry_to_folio() in zap_huge_pmd() Call pfn_swap_entry_to_folio() in zap_huge_pmd() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() embedded inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-7-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33b720037ab725..7a28a7db08ea0d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { - struct page *page = NULL; + struct folio *folio = NULL; int flush_needed = 1; if (pmd_present(orig_pmd)) { - page = pmd_page(orig_pmd); - folio_remove_rmap_pmd(page_folio(page), page, vma); + struct page *page = pmd_page(orig_pmd); + + folio = page_folio(page); + folio_remove_rmap_pmd(folio, page, vma); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - if (PageAnon(page)) { + if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + -HPAGE_PMD_NR); } spin_unlock(ptl); if (flush_needed) - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); } return 1; } From 948d3bb7dc9eb922cca81b0b454e9c9ec9ba7c4c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:26 +0000 Subject: [PATCH 449/707] mm: use pfn_swap_entry_folio() in copy_nonpresent_pte() Call pfn_swap_entry_folio() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-8-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 762458faec6969..3d28705b00616d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); - rss[mm_counter(page)]++; + rss[mm_counter(&folio->page)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { From 9134bed5edf76d66a67741dfea662aacab55e7b7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:27 +0000 Subject: [PATCH 450/707] mm: convert to should_zap_page() to should_zap_folio() Make should_zap_page() take a folio and rename it to should_zap_folio() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() hidden inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-9-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3d28705b00616d..25455674c047bc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1369,19 +1369,20 @@ static inline bool should_zap_cows(struct zap_details *details) return details->even_cows; } -/* Decides whether we should zap this page with the page pointer specified */ -static inline bool should_zap_page(struct zap_details *details, struct page *page) +/* Decides whether we should zap this folio with the folio pointer specified */ +static inline bool should_zap_folio(struct zap_details *details, + struct folio *folio) { - /* If we can make a decision without *page.. */ + /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero page */ - if (!page) + /* E.g. the caller passes NULL for the case of a zero folio */ + if (!folio) return true; - /* Otherwise we should only zap non-anon pages */ - return !PageAnon(page); + /* Otherwise we should only zap non-anon folios */ + return !folio_test_anon(folio); } static inline bool zap_drop_file_uffd_wp(struct zap_details *details) @@ -1447,7 +1448,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, unsigned int delay_rmap; page = vm_normal_page(vma, addr, ptent); - if (unlikely(!should_zap_page(details, page))) + if (page) + folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1460,7 +1464,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } - folio = page_folio(page); delay_rmap = 0; if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { @@ -1492,7 +1495,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); - if (unlikely(!should_zap_page(details, page))) + if (unlikely(!should_zap_folio(details, folio))) continue; /* * Both device private/exclusive mappings should only @@ -1513,10 +1516,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - if (!should_zap_page(details, page)) + folio = pfn_swap_entry_folio(entry); + if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(page)]--; + rss[mm_counter(&folio->page)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only From c221b4d007869f0da73e12b3685ba8c2434f586a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 12 Jan 2024 18:14:32 +0800 Subject: [PATCH 451/707] mm-convert-to-should_zap_page-to-should_zap_folio-fix fix used-uninitialized warning Link: https://lkml.kernel.org/r/962a7993-fce9-4de8-85cd-25e290f25736@huawei.com Signed-off-by: Kefeng Wang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401121250.A221BL2D-lkp@intel.com/ Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 25455674c047bc..96eae99bb25c37 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1435,7 +1435,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); - struct folio *folio; + struct folio *folio = NULL; struct page *page; if (pte_none(ptent)) From dbf44e93baa056ca5b60a25afa50329f5046138f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:28 +0000 Subject: [PATCH 452/707] mm: convert mm_counter() to take a folio Now all callers of mm_counter() have a folio, convert mm_counter() to take a folio. Saves a call to compound_head() hidden inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-10-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 2 +- include/linux/mm.h | 6 +++--- mm/memory.c | 10 +++++----- mm/rmap.c | 8 ++++---- mm/userfaultfd.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7e5dd4b176642c..b71432b15d665c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -723,7 +723,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec516948..22e597b36b3887 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2603,11 +2603,11 @@ static inline int mm_counter_file(struct page *page) return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(&folio->page); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/mm/memory.c b/mm/memory.c index 96eae99bb25c37..b04315a7ad2db2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -808,7 +808,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); - rss[mm_counter(&folio->page)]++; + rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * keep things as they are. */ folio_get(folio); - rss[mm_counter(page)]++; + rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, src_vma); @@ -1476,7 +1476,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); } - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) @@ -1504,7 +1504,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (is_device_private_entry(entry)) folio_remove_rmap_pte(folio, page, vma); folio_put(folio); @@ -1519,7 +1519,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(&folio->page)]--; + rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only diff --git a/mm/rmap.c b/mm/rmap.c index f5d43edad529a7..4648cf1d8178b5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; @@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7cf7d43842590c..ae80c37148290a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -124,7 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ - inc_mm_counter(dst_mm, mm_counter(page)); + inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); From f06d3e9d1d39d9b67d691902d0debf94c0dcd2c8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:29 +0000 Subject: [PATCH 453/707] mm: convert mm_counter_file() to take a folio Now all callers of mm_counter_file() have a folio, convert mm_counter_file() to take a folio. Saves a call to compound_head() hidden inside PageSwapBacked(). Link: https://lkml.kernel.org/r/20240111152429.3374566-11-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 4 ++-- mm/memory.c | 10 +++++----- mm/rmap.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 22e597b36b3887..ac6b71cbdffbfa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2595,10 +2595,10 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } @@ -2607,7 +2607,7 @@ static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(&folio->page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 929e98c629652a..e4834d23e1d1a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7a28a7db08ea0d..f005f04247355f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1931,7 +1931,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); } @@ -2456,7 +2456,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b219acb528e25..fe43fbc4452539 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } /* step 4: remove empty page table */ @@ -1665,7 +1665,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); diff --git a/mm/memory.c b/mm/memory.c index b04315a7ad2db2..f5750bfcdc058f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -966,7 +966,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } else if (page) { folio_get(folio); folio_dup_file_rmap_pte(folio, page); - rss[mm_counter_file(page)]++; + rss[mm_counter_file(folio)]++; } /* @@ -1873,7 +1873,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ folio_get(folio); - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3178,7 +3178,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(&old_folio->page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { @@ -4463,7 +4463,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* @@ -4526,7 +4526,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); } else { - add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr); folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); diff --git a/mm/rmap.c b/mm/rmap.c index 4648cf1d8178b5..1cf2bffa48ed87 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(&folio->page)); + dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) From c57c2645447bdc671c8f7c0810a39f1a77028420 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 11 Jan 2024 08:45:33 +0000 Subject: [PATCH 454/707] fs/proc/task_mmu.c: add_to_pagemap: remove useless parameter addr Function parameter addr of add_to_pagemap() is useless. Remove it. Link: https://lkml.kernel.org/r/20240111084533.40038-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Peter Xu Cc: Ryan Roberts Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ac6ea2cc2ee8fe..23fbab954c20b6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { @@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) From 98158c18452b502a76900ddfaec31f68a54a82e3 Mon Sep 17 00:00:00 2001 From: Carlos Galo Date: Thu, 11 Jan 2024 21:05:30 +0000 Subject: [PATCH 455/707] mm: update mark_victim tracepoints fields The current implementation of the mark_victim tracepoint provides only the process ID (pid) of the victim process. This limitation poses challenges for userspace tools that need additional information about the OOM victim. The association between pid and the additional data may be lost after the kill, making it difficult for userspace to correlate the OOM event with the specific process. In order to mitigate this limitation, add the following fields: - UID In Android each installed application has a unique UID. Including the `uid` assists in correlating OOM events with specific apps. - Process Name (comm) Enables identification of the affected process. - OOM Score Allows userspace to get additional insights of the relative kill priority of the OOM victim. Link: https://lkml.kernel.org/r/20240111210539.636607-1-carlosgalo@google.com Signed-off-by: Carlos Galo Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/trace/events/oom.h | 19 +++++++++++++++---- mm/oom_kill.c | 6 +++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 26a11e4a2c361d..3c5941da80755b 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -72,19 +72,30 @@ TRACE_EVENT(reclaim_retry_zone, ); TRACE_EVENT(mark_victim, - TP_PROTO(int pid), + TP_PROTO(struct task_struct *task, uid_t uid), - TP_ARGS(pid), + TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) + __field(uid_t, uid) + __string(comm, task->comm) + __field(short, oom_score_adj) ), TP_fast_assign( - __entry->pid = pid; + __entry->pid = task->pid; + __entry->uid = uid; + __assign_str(comm, task->comm); + __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d", __entry->pid) + TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd", + __entry->pid, + __entry->uid, + __get_str(comm), + __entry->oom_score_adj + ) ); TRACE_EVENT(wake_reaper, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91ccd82097c2ba..8d6a207c3c5905 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk) */ static void mark_oom_victim(struct task_struct *tsk) { + const struct cred *cred; struct mm_struct *mm = tsk->mm; WARN_ON(oom_killer_disabled); @@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); - trace_mark_victim(tsk->pid); + cred = get_task_cred(tsk); + trace_mark_victim(tsk, cred->uid.val); + put_cred(cred); } /** From 7eda6ba9dc62ea00c611e77c61ec18796e1428e4 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 15 Jan 2024 11:25:22 +0100 Subject: [PATCH 456/707] readahead: use ilog2 instead of a while loop in page_cache_ra_order() A while loop is used to adjust the new_order to be lower than the ra->size. ilog2 could be used to do the same instead of using a loop. ilog2 typically resolves to a bit scan reverse instruction. This is particularly useful when ra->size is smaller than the 2^new_order as it resolves in one instruction instead of looping to find the new_order. No functional changes. Link: https://lkml.kernel.org/r/20240115102523.2336742-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 2648ec4f04947b..1e74455f908e50 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl, if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } filemap_invalidate_lock_shared(mapping); From ca6196c1e3f77ea919406fe4295630e29c23f7d5 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Sat, 13 Jan 2024 17:44:34 +0800 Subject: [PATCH 457/707] mm: HVO: introduce helper function to update and flush pgtable Patch series "A Solution to Re-enable hugetlb vmemmap optimize", v3. HVO was previously disabled on arm64 [1] due to the lack of necessary BBM(break-before-make) logic when changing page tables. This set of patches fix this by adding necessary BBM sequence when changing page table, and supporting vmemmap page fault handling to fixup kernel address translation fault if vmemmap is concurrently accessed. I have tested this patch set with concurrently accessing the vmemmap address when do BBM and can recover by vmemmap fault handler. Also tested under the config of 2/3/4 pgtable levels with 4K/64K page size and all works well. This patch (of 3): Add pmd/pte update and tlb flush helper function to update page table. This refactoring patch is designed to facilitate each architecture to implement its own special logic in preparation for the arm64 architecture to follow the necessary break-before-make sequence when updating page tables. Link: https://lkml.kernel.org/r/20240113094436.2506396-1-sunnanyong@huawei.com Link: https://lkml.kernel.org/r/20240113094436.2506396-2-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 55 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index da177e49d95648..f1f5702bce4f6a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -46,6 +46,37 @@ struct vmemmap_remap_walk { unsigned long flags; }; +#ifndef vmemmap_update_pmd +static inline void vmemmap_update_pmd(unsigned long addr, + pmd_t *pmdp, pte_t *ptep) +{ + pmd_populate_kernel(&init_mm, pmdp, ptep); +} +#endif + +#ifndef vmemmap_update_pte +static inline void vmemmap_update_pte(unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(&init_mm, addr, ptep, pte); +} +#endif + +#ifndef vmemmap_flush_tlb_all +static inline void vmemmap_flush_tlb_all(void) +{ + flush_tlb_all(); +} +#endif + +#ifndef vmemmap_flush_tlb_range +static inline void vmemmap_flush_tlb_range(unsigned long start, + unsigned long end) +{ + flush_tlb_kernel_range(start, end); +} +#endif + static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { @@ -81,9 +112,9 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); + vmemmap_update_pmd(start, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) - flush_tlb_kernel_range(start, start + PMD_SIZE); + vmemmap_flush_tlb_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -171,7 +202,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) - flush_tlb_kernel_range(start, end); + vmemmap_flush_tlb_range(start, end); return 0; } @@ -217,15 +248,15 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, /* * Makes sure that preceding stores to the page contents from - * vmemmap_remap_free() become visible before the set_pte_at() - * write. + * vmemmap_remap_free() become visible before the + * vmemmap_update_pte() write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); - set_pte_at(&init_mm, addr, pte, entry); + vmemmap_update_pte(addr, pte, entry); } /* @@ -264,10 +295,10 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, /* * Makes sure that preceding stores to the page contents become visible - * before the set_pte_at() write. + * before the vmemmap_update_pte() write. */ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + vmemmap_update_pte(addr, pte, mk_pte(page, pgprot)); } /** @@ -519,7 +550,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, } if (restored) - flush_tlb_all(); + vmemmap_flush_tlb_all(); if (!ret) ret = restored; return ret; @@ -642,7 +673,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l break; } - flush_tlb_all(); + vmemmap_flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; @@ -659,7 +690,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { - flush_tlb_all(); + vmemmap_flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, @@ -667,7 +698,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l } } - flush_tlb_all(); + vmemmap_flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } From cb78d8ce2f43169251cc001f5c2a859295a1ecc4 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Sat, 13 Jan 2024 17:44:35 +0800 Subject: [PATCH 458/707] arm64: mm: HVO: support BBM of vmemmap pgtable safely Implement vmemmap_update_pmd and vmemmap_update_pte on arm64 to do BBM(break-before-make) logic when change the page table of vmemmap address, they will under the init_mm.page_table_lock. If a translation fault of vmemmap address concurrently happened after pte/pmd cleared, vmemmap page fault handler will acquire the init_mm.page_table_lock to wait for vmemmap update to complete, by then the virtual address is valid again, so PF can return and access can continue. In other case, do the traditional kernel fault. Implement vmemmap_flush_tlb_all/range on arm64 with nothing to do because tlb already flushed in every single BBM. Link: https://lkml.kernel.org/r/20240113094436.2506396-3-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/esr.h | 4 ++ arch/arm64/include/asm/pgtable.h | 8 ++++ arch/arm64/include/asm/tlbflush.h | 16 +++++++ arch/arm64/mm/fault.c | 78 +++++++++++++++++++++++++++++-- arch/arm64/mm/mmu.c | 28 +++++++++++ 5 files changed, 131 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 353fe08546cf90..f13d1a094fd1ab 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -116,6 +116,10 @@ #define ESR_ELx_FSC_SERROR (0x11) #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) +#define ESR_ELx_FSC_FAULT_L0 (0x04) +#define ESR_ELx_FSC_FAULT_L1 (0x05) +#define ESR_ELx_FSC_FAULT_L2 (0x06) +#define ESR_ELx_FSC_FAULT_L3 (0x07) #define ESR_ELx_FSC_PERM (0x0C) #define ESR_ELx_FSC_SEA_TTW0 (0x14) #define ESR_ELx_FSC_SEA_TTW1 (0x15) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751c6..b50270107e2f02 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1124,6 +1124,14 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep); +#define vmemmap_update_pmd vmemmap_update_pmd +void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte); +#define vmemmap_update_pte vmemmap_update_pte +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1deb5d789c2e23..79e932a1bdf87f 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -504,6 +504,22 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) dsb(ish); isb(); } + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline void vmemmap_flush_tlb_all(void) +{ + /* do nothing, already flushed tlb in every single BBM */ +} +#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all + +static inline void vmemmap_flush_tlb_range(unsigned long start, + unsigned long end) +{ + /* do nothing, already flushed tlb in every single BBM */ +} +#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range +#endif + #endif #endif diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a828434..13189322a38ff0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -368,6 +368,75 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) return false; } +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline bool vmemmap_fault_may_fixup(unsigned long addr, + unsigned long esr) +{ + if (addr < VMEMMAP_START || addr >= VMEMMAP_END) + return false; + + /* + * Only try to handle translation fault level 2 or level 3, + * because hugetlb vmemmap optimize only clear pmd or pte. + */ + switch (esr & ESR_ELx_FSC) { + case ESR_ELx_FSC_FAULT_L2: + case ESR_ELx_FSC_FAULT_L3: + return true; + default: + return false; + } +} + +/* + * PMD mapped vmemmap should has been split as PTE mapped + * by HVO now, here we only check this case, other cases + * should fail. + * Also should check the addr is healthy enough that will not cause + * a level2 or level3 translation fault again after page fault + * handled with success, so we need check both bits[1:0] of PMD and + * PTE as ARM Spec mentioned below: + * A Translation fault is generated if bits[1:0] of a translation + * table descriptor identify the descriptor as either a Fault + * encoding or a reserved encoding. + */ +static inline bool vmemmap_addr_healthy(unsigned long addr) +{ + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pmdp = pmd_off_k(addr); + pmd = pmdp_get(pmdp); + if (!pmd_table(pmd)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + pte = ptep_get(ptep); + return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE; +} + +static bool vmemmap_handle_page_fault(unsigned long addr, + unsigned long esr) +{ + bool ret; + + if (likely(!vmemmap_fault_may_fixup(addr, esr))) + return false; + + spin_lock(&init_mm.page_table_lock); + ret = vmemmap_addr_healthy(addr); + spin_unlock(&init_mm.page_table_lock); + + return ret; +} +#else +static inline bool vmemmap_handle_page_fault(unsigned long addr, + unsigned long esr) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ + static bool is_translation_fault(unsigned long esr) { return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; @@ -405,9 +474,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && - kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) - return; + if (is_translation_fault(esr)) { + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) + return; + if (vmemmap_handle_page_fault(addr, esr)) + return; + } msg = "paging request"; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9c3..d794b2f4b5a3cd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1146,6 +1146,34 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, return 1; } +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +/* + * In the window between the page table entry is cleared and filled + * with a new value, other threads have the opportunity to concurrently + * access the vmemmap area then page translation fault occur. + * Therefore, we need to ensure that the init_mm.page_table_lock is held + * to synchronize the vmemmap page fault handling which will wait for + * this lock to be released to ensure that the page table entry has been + * refreshed with a new valid value. + */ +void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep) +{ + lockdep_assert_held(&init_mm.page_table_lock); + pmd_clear(pmdp); + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); +} + +void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte) +{ + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + set_pte_at(&init_mm, addr, ptep, pte); + spin_unlock(&init_mm.page_table_lock); +} +#endif + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { From a3045e96974aa75213f24576697ada761b5df489 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Sat, 13 Jan 2024 17:44:36 +0800 Subject: [PATCH 459/707] arm64: mm: re-enable OPTIMIZE_HUGETLB_VMEMMAP Now update of vmemmap page table can follow the rule of break-before-make safely for arm64 architecture, re-enable HVO on arm64. Link: https://lkml.kernel.org/r/20240113094436.2506396-4-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d43513968..ad5dbaf3dc9f57 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -104,6 +104,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_NO_INSTR select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES From 209826bbad1234aa8f683e73ab85ac5710399903 Mon Sep 17 00:00:00 2001 From: Ronald Monthero Date: Tue, 16 Jan 2024 23:31:45 +1000 Subject: [PATCH 460/707] mm/zswap: improve with alloc_workqueue() call The core-api create_workqueue is deprecated, this patch replaces the create_workqueue with alloc_workqueue. The previous implementation workqueue of zswap was a bounded workqueue, this patch uses alloc_workqueue() to create an unbounded workqueue. The WQ_UNBOUND attribute is desirable making the workqueue not localized to a specific cpu so that the scheduler is free to exercise improvisations in any demanding scenarios for offloading cpu time slices for workqueues. For example if any other workqueues of the same primary cpu had to be served which are WQ_HIGHPRI and WQ_CPU_INTENSIVE. Also Unbound workqueue happens to be more efficient in a system during memory pressure scenarios in comparison to a bounded workqueue. shrink_wq = alloc_workqueue("zswap-shrink", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); Overall the change suggested in this patch should be seamless and does not alter the existing behavior, other than the improvisation to be an unbounded workqueue. Link: https://lkml.kernel.org/r/20240116133145.12454-1-debug.penguin32@gmail.com Signed-off-by: Ronald Monthero Cc: Chris Li Cc: Dan Streetman Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index d2423247acfd64..e7b38aefb9afe7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1884,7 +1884,8 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!shrink_wq) goto fallback_fail; From 2282c541aefdb0a425b928e1fb8b7c1fe410f722 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 16 Jan 2024 14:12:35 +0000 Subject: [PATCH 461/707] tools/mm: add thpmaps script to dump THP usage info With the proliferation of large folios for file-backed memory, and more recently the introduction of multi-size THP for anonymous memory, it is becoming useful to be able to see exactly how large folios are mapped into processes. For some architectures (e.g. arm64), if most memory is mapped using contpte-sized and -aligned blocks, TLB usage can be optimized so it's useful to see where these requirements are and are not being met. thpmaps is a Python utility that reads /proc//smaps, /proc//pagemap and /proc/kpageflags to print information about how transparent huge pages (both file and anon) are mapped to a specified process or cgroup. It aims to help users debug and optimize their workloads. In future we may wish to introduce stats directly into the kernel (e.g. smaps or similar), but for now this provides a short term solution without the need to introduce any new ABI. Run with help option for a full listing of the arguments: # ./thpmaps --help --8<-- usage: thpmaps [-h] [--pid pid | --cgroup path] [--rollup] [--cont size[KMG]] [--inc-smaps] [--inc-empty] [--periodic sleep_ms] Prints information about how transparent huge pages are mapped, either system-wide, or for a specified process or cgroup. When run with --pid, the user explicitly specifies the set of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run with --cgroup, the user passes either a v1 or v2 cgroup and all pids that belong to the cgroup subtree are scanned. When run with neither --pid nor --cgroup, the full set of pids on the system is gathered from /proc and scanned as if the user had provided "--pid 1 --pid 2 ...". A default set of statistics is always generated for THP mappings. However, it is also possible to generate additional statistics for "contiguous block mappings" where the block size is user-defined. Statistics are maintained independently for anonymous and file-backed (pagecache) memory and are shown both in kB and as a percentage of either total anonymous or total file-backed memory as appropriate. THP Statistics -------------- Statistics are always generated for fully- and contiguously-mapped THPs whose mapping address is aligned to their size, for each supported by the system. Separate counters describe THPs mapped by PTE vs those mapped by PMD. (Although note a THP can only be mapped by PMD if it is PMD-sized): - anon-thp-pte-aligned-kB - file-thp-pte-aligned-kB - anon-thp-pmd-aligned-kB - file-thp-pmd-aligned-kB Similarly, statistics are always generated for fully- and contiguously- mapped THPs whose mapping address is *not* aligned to their size, for each supported by the system. Due to the unaligned mapping, it is impossible to map by PMD, so there are only PTE counters for this case: - anon-thp-pte-unaligned-kB - file-thp-pte-unaligned-kB Statistics are also always generated for mapped pages that belong to a THP but where the is THP is *not* fully- and contiguously- mapped. These "partial" mappings are all counted in the same counter regardless of the size of the THP that is partially mapped: - anon-thp-pte-partial - file-thp-pte-partial Contiguous Block Statistics --------------------------- An optional, additional set of statistics is generated for every contiguous block size specified with `--cont `. These statistics show how much memory is mapped in contiguous blocks of and also aligned to . A given contiguous block must all belong to the same THP, but there is no requirement for it to be the *whole* THP. Separate counters describe contiguous blocks mapped by PTE vs those mapped by PMD: - anon-cont-pte-aligned-kB - file-cont-pte-aligned-kB - anon-cont-pmd-aligned-kB - file-cont-pmd-aligned-kB As an example, if monitoring 64K contiguous blocks (--cont 64K), there are a number of sources that could provide such blocks: a fully- and contiguously-mapped 64K THP that is aligned to a 64K boundary would provide 1 block. A fully- and contiguously-mapped 128K THP that is aligned to at least a 64K boundary would provide 2 blocks. Or a 128K THP that maps its first 100K, but contiguously and starting at a 64K boundary would provide 1 block. A fully- and contiguously-mapped 2M THP would provide 32 blocks. There are many other possible permutations. options: -h, --help show this help message and exit --pid pid Process id of the target process. Maybe issued multiple times to scan multiple processes. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --cgroup path Path to the target cgroup in sysfs. Iterates over every pid in the cgroup and its children. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --rollup Sum the per-vma statistics to provide a summary over the whole system, process or cgroup. --cont size[KMG] Adds stats for memory that is mapped in contiguous blocks of and also aligned to . May be issued multiple times to track multiple sized blocks. Useful to infer e.g. arm64 contpte and hpa mappings. Size must be a power-of-2 number of pages. --inc-smaps Include all numerical, additive /proc//smaps stats in the output. --inc-empty Show all statistics including those whose value is 0. --periodic sleep_ms Run in a loop, polling every sleep_ms milliseconds. Requires root privilege to access pagemap and kpageflags. --8<-- Example command to summarise fully and partially mapped THPs and 64K contiguous blocks over all VMAs in all processes in the system (--inc-empty forces printing stats that are 0): # ./thpmaps --cont 64K --rollup --inc-empty --8<-- anon-thp-pmd-aligned-2048kB: 139264 kB ( 6%) file-thp-pmd-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-aligned-16kB: 0 kB ( 0%) anon-thp-pte-aligned-32kB: 0 kB ( 0%) anon-thp-pte-aligned-64kB: 72256 kB ( 3%) anon-thp-pte-aligned-128kB: 0 kB ( 0%) anon-thp-pte-aligned-256kB: 0 kB ( 0%) anon-thp-pte-aligned-512kB: 0 kB ( 0%) anon-thp-pte-aligned-1024kB: 0 kB ( 0%) anon-thp-pte-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-unaligned-16kB: 0 kB ( 0%) anon-thp-pte-unaligned-32kB: 0 kB ( 0%) anon-thp-pte-unaligned-64kB: 0 kB ( 0%) anon-thp-pte-unaligned-128kB: 0 kB ( 0%) anon-thp-pte-unaligned-256kB: 0 kB ( 0%) anon-thp-pte-unaligned-512kB: 0 kB ( 0%) anon-thp-pte-unaligned-1024kB: 0 kB ( 0%) anon-thp-pte-unaligned-2048kB: 0 kB ( 0%) anon-thp-pte-partial: 63232 kB ( 3%) file-thp-pte-aligned-16kB: 809024 kB (47%) file-thp-pte-aligned-32kB: 43168 kB ( 3%) file-thp-pte-aligned-64kB: 98496 kB ( 6%) file-thp-pte-aligned-128kB: 17536 kB ( 1%) file-thp-pte-aligned-256kB: 0 kB ( 0%) file-thp-pte-aligned-512kB: 0 kB ( 0%) file-thp-pte-aligned-1024kB: 0 kB ( 0%) file-thp-pte-aligned-2048kB: 0 kB ( 0%) file-thp-pte-unaligned-16kB: 21712 kB ( 1%) file-thp-pte-unaligned-32kB: 704 kB ( 0%) file-thp-pte-unaligned-64kB: 896 kB ( 0%) file-thp-pte-unaligned-128kB: 44928 kB ( 3%) file-thp-pte-unaligned-256kB: 0 kB ( 0%) file-thp-pte-unaligned-512kB: 0 kB ( 0%) file-thp-pte-unaligned-1024kB: 0 kB ( 0%) file-thp-pte-unaligned-2048kB: 0 kB ( 0%) file-thp-pte-partial: 9252 kB ( 1%) anon-cont-pmd-aligned-64kB: 139264 kB ( 6%) file-cont-pmd-aligned-64kB: 0 kB ( 0%) anon-cont-pte-aligned-64kB: 100672 kB ( 4%) file-cont-pte-aligned-64kB: 161856 kB ( 9%) --8<-- Link: https://lkml.kernel.org/r/20240116141235.960842-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Barry Song Cc: Alistair Popple Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Zenghui Yu Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/mm/Makefile | 9 +- tools/mm/thpmaps | 675 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 680 insertions(+), 4 deletions(-) create mode 100644 tools/mm/thpmaps diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 1c5606cc33346b..7bb03606b9eaa2 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,8 @@ # include ../scripts/Makefile.include -TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort +INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a @@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a CFLAGS += -Wall -Wextra -I../lib/ -pthread LDFLAGS += $(LIBS) -pthread -all: $(TARGETS) +all: $(BUILD_TARGETS) -$(TARGETS): $(LIBS) +$(BUILD_TARGETS): $(LIBS) $(LIBS): make -C $(LIB_DIR) @@ -29,4 +30,4 @@ sbindir ?= /usr/sbin install: all install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) + install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps new file mode 100644 index 00000000000000..803e0318f2fea1 --- /dev/null +++ b/tools/mm/thpmaps @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2024 ARM Ltd. +# +# Utility providing smaps-like output detailing transparent hugepage usage. +# For more info, run: +# ./thpmaps --help +# +# Requires numpy: +# pip3 install numpy + + +import argparse +import collections +import math +import os +import re +import resource +import shutil +import sys +import textwrap +import time +import numpy as np + + +with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f: + PAGE_SIZE = resource.getpagesize() + PAGE_SHIFT = int(math.log2(PAGE_SIZE)) + PMD_SIZE = int(f.read()) + PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE)) + + +def align_forward(v, a): + return (v + (a - 1)) & ~(a - 1) + + +def align_offset(v, a): + return v & (a - 1) + + +def kbnr(kb): + # Convert KB to number of pages. + return (kb << 10) >> PAGE_SHIFT + + +def nrkb(nr): + # Convert number of pages to KB. + return (nr << PAGE_SHIFT) >> 10 + + +def odkb(order): + # Convert page order to KB. + return (PAGE_SIZE << order) >> 10 + + +def cont_ranges_all(search, index): + # Given a list of arrays, find the ranges for which values are monotonically + # incrementing in all arrays. all arrays in search and index must be the + # same size. + sz = len(search[0]) + r = np.full(sz, 2) + d = np.diff(search[0]) == 1 + for dd in [np.diff(arr) == 1 for arr in search[1:]]: + d &= dd + r[1:] -= d + r[:-1] -= d + return [np.repeat(arr, r).reshape(-1, 2) for arr in index] + + +class ArgException(Exception): + pass + + +class FileIOException(Exception): + pass + + +class BinArrayFile: + # Base class used to read /proc//pagemap and /proc/kpageflags into a + # numpy array. Use inherrited class in a with clause to ensure file is + # closed when it goes out of scope. + def __init__(self, filename, element_size): + self.element_size = element_size + self.filename = filename + self.fd = os.open(self.filename, os.O_RDONLY) + + def cleanup(self): + os.close(self.fd) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + + def _readin(self, offset, buffer): + length = os.preadv(self.fd, (buffer,), offset) + if len(buffer) != length: + raise FileIOException('error: {} failed to read {} bytes at {:x}' + .format(self.filename, len(buffer), offset)) + + def _toarray(self, buf): + assert(self.element_size == 8) + return np.frombuffer(buf, dtype=np.uint64) + + def getv(self, vec): + vec *= self.element_size + offsets = vec[:, 0] + lengths = (np.diff(vec) + self.element_size).reshape(len(vec)) + buf = bytearray(int(np.sum(lengths))) + view = memoryview(buf) + pos = 0 + for offset, length in zip(offsets, lengths): + offset = int(offset) + length = int(length) + self._readin(offset, view[pos:pos+length]) + pos += length + return self._toarray(buf) + + def get(self, index, nr=1): + offset = index * self.element_size + length = nr * self.element_size + buf = bytearray(length) + self._readin(offset, buf) + return self._toarray(buf) + + +PM_PAGE_PRESENT = 1 << 63 +PM_PFN_MASK = (1 << 55) - 1 + +class PageMap(BinArrayFile): + # Read ranges of a given pid's pagemap into a numpy array. + def __init__(self, pid='self'): + super().__init__(f'/proc/{pid}/pagemap', 8) + + +KPF_ANON = 1 << 12 +KPF_COMPOUND_HEAD = 1 << 15 +KPF_COMPOUND_TAIL = 1 << 16 +KPF_THP = 1 << 22 + +class KPageFlags(BinArrayFile): + # Read ranges of /proc/kpageflags into a numpy array. + def __init__(self): + super().__init__(f'/proc/kpageflags', 8) + + +vma_all_stats = set([ + "Size", + "Rss", + "Pss", + "Pss_Dirty", + "Shared_Clean", + "Shared_Dirty", + "Private_Clean", + "Private_Dirty", + "Referenced", + "Anonymous", + "KSM", + "LazyFree", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", + "Shared_Hugetlb", + "Private_Hugetlb", + "Swap", + "SwapPss", + "Locked", +]) + +vma_min_stats = set([ + "Rss", + "Anonymous", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", +]) + +VMA = collections.namedtuple('VMA', [ + 'name', + 'start', + 'end', + 'read', + 'write', + 'execute', + 'private', + 'pgoff', + 'major', + 'minor', + 'inode', + 'stats', +]) + +class VMAList: + # A container for VMAs, parsed from /proc//smaps. Iterate over the + # instance to receive VMAs. + def __init__(self, pid='self', stats=[]): + self.vmas = [] + with open(f'/proc/{pid}/smaps', 'r') as file: + for line in file: + elements = line.split() + if '-' in elements[0]: + start, end = map(lambda x: int(x, 16), elements[0].split('-')) + major, minor = map(lambda x: int(x, 16), elements[3].split(':')) + self.vmas.append(VMA( + name=elements[5] if len(elements) == 6 else '', + start=start, + end=end, + read=elements[1][0] == 'r', + write=elements[1][1] == 'w', + execute=elements[1][2] == 'x', + private=elements[1][3] == 'p', + pgoff=int(elements[2], 16), + major=major, + minor=minor, + inode=int(elements[4], 16), + stats={}, + )) + else: + param = elements[0][:-1] + if param in stats: + value = int(elements[1]) + self.vmas[-1].stats[param] = {'type': None, 'value': value} + + def __iter__(self): + yield from self.vmas + + +def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the mapped THPs. + stats = { + 'file': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + 'anon': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + } + + for rindex, rpfn in zip(ranges[0], ranges[2]): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + pfn_end = int(rpfn[1]) + 1 + + folios = indexes[index_next:index_end][heads[index_next:index_end]] + + # Account pages for any partially mapped THP at the front. In that case, + # the first page of the range is a tail. + nr = (int(folios[0]) if len(folios) else index_end) - index_next + stats['anon' if anons[index_next] else 'file']['partial'] += nr + + # Account pages for any partially mapped THP at the back. In that case, + # the next page after the range is a tail. + if len(folios): + flags = int(kpageflags.get(pfn_end)[0]) + if flags & KPF_COMPOUND_TAIL: + nr = index_end - int(folios[-1]) + folios = folios[:-1] + index_end -= nr + stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr + + # Account fully mapped THPs in the middle of the range. + if len(folios): + folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1])) + folio_orders = np.log2(folio_nrs).astype(np.uint64) + for index, order in zip(folios, folio_orders): + index = int(index) + order = int(order) + nr = 1 << order + vfn = int(vfns[index]) + align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned' + anon = 'anon' if anons[index] else 'file' + stats[anon][align][order] += nr + + # Account PMD-mapped THPs spearately, so filter out of the stats. There is a + # race between acquiring the smaps stats and reading pagemap, where memory + # could be deallocated. So clamp to zero incase it would have gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped)) + stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + def flatten_sub(type, subtype, stats): + param = f"{type}-thp-pte-{subtype}-{{}}kB" + for od, nr in enumerate(stats[2:], 2): + rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)} + + def flatten_type(type, stats): + flatten_sub(type, 'aligned', stats['aligned']) + flatten_sub(type, 'unaligned', stats['unaligned']) + rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])} + + flatten_type('anon', stats['anon']) + flatten_type('file', stats['file']) + + return rstats + + +def cont_parse(vma, order, ranges, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the contiguous blocks. + nr_cont = 1 << order + nr_anon = 0 + nr_file = 0 + + for rindex, rvfn, rpfn in zip(*ranges): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + vfn_start = int(rvfn[0]) + pfn_start = int(rpfn[0]) + + if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont): + continue + + off = align_forward(vfn_start, nr_cont) - vfn_start + index_next += off + + while index_next + nr_cont <= index_end: + folio_boundary = heads[index_next+1:index_next+nr_cont].any() + if not folio_boundary: + if anons[index_next]: + nr_anon += nr_cont + else: + nr_file += nr_cont + index_next += nr_cont + + # Account blocks that are PMD-mapped spearately, so filter out of the stats. + # There is a race between acquiring the smaps stats and reading pagemap, + # where memory could be deallocated. So clamp to zero incase it would have + # gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped)) + nr_file = max(0, nr_file - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)} + rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)} + + return rstats + + +def vma_print(vma, pid): + # Prints a VMA instance in a format similar to smaps. The main difference is + # that the pid is included as the first value. + print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}" + .format( + pid, vma.start, vma.end, + 'r' if vma.read else '-', 'w' if vma.write else '-', + 'x' if vma.execute else '-', 'p' if vma.private else 's', + vma.pgoff, vma.major, vma.minor, vma.inode, vma.name + )) + + +def stats_print(stats, tot_anon, tot_file, inc_empty): + # Print a statistics dictionary. + label_field = 32 + for label, stat in stats.items(): + type = stat['type'] + value = stat['value'] + if value or inc_empty: + pad = max(0, label_field - len(label) - 1) + if type == 'anon' and tot_anon > 0: + percent = f' ({value / tot_anon:3.0%})' + elif type == 'file' and tot_file > 0: + percent = f' ({value / tot_file:3.0%})' + else: + percent = '' + print(f"{label}:{' ' * pad}{value:8} kB{percent}") + + +def vma_parse(vma, pagemap, kpageflags, contorders): + # Generate thp and cont statistics for a single VMA. + start = vma.start >> PAGE_SHIFT + end = vma.end >> PAGE_SHIFT + + pmes = pagemap.get(start, end - start) + present = pmes & PM_PAGE_PRESENT != 0 + pfns = pmes & PM_PFN_MASK + pfns = pfns[present] + vfns = np.arange(start, end, dtype=np.uint64) + vfns = vfns[present] + + pfn_vec = cont_ranges_all([pfns], [pfns])[0] + flags = kpageflags.getv(pfn_vec) + anons = flags & KPF_ANON != 0 + heads = flags & KPF_COMPOUND_HEAD != 0 + thps = flags & KPF_THP != 0 + + vfns = vfns[thps] + pfns = pfns[thps] + anons = anons[thps] + heads = heads[thps] + + indexes = np.arange(len(vfns), dtype=np.uint64) + ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns]) + + thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads) + contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders] + + tot_anon = vma.stats['Anonymous']['value'] + tot_file = vma.stats['Rss']['value'] - tot_anon + + return { + **thpstats, + **{k: v for s in contstats for k, v in s.items()} + }, tot_anon, tot_file + + +def do_main(args): + pids = set() + rollup = {} + rollup_anon = 0 + rollup_file = 0 + + if args.cgroup: + strict = False + for walk_info in os.walk(args.cgroup): + cgroup = walk_info[0] + with open(f'{cgroup}/cgroup.procs') as pidfile: + for line in pidfile.readlines(): + pids.add(int(line.strip())) + elif args.pid: + strict = True + pids = pids.union(args.pid) + else: + strict = False + for pid in os.listdir('/proc'): + if pid.isdigit(): + pids.add(int(pid)) + + if not args.rollup: + print(" PID START END PROT OFFSET DEV INODE OBJECT") + + for pid in pids: + try: + with PageMap(pid) as pagemap: + with KPageFlags() as kpageflags: + for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): + if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0: + stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) + else: + stats = {} + vma_anon = 0 + vma_file = 0 + if args.inc_smaps: + stats = {**vma.stats, **stats} + if args.rollup: + for k, v in stats.items(): + if k in rollup: + assert(rollup[k]['type'] == v['type']) + rollup[k]['value'] += v['value'] + else: + rollup[k] = v + rollup_anon += vma_anon + rollup_file += vma_file + else: + vma_print(vma, pid) + stats_print(stats, vma_anon, vma_file, args.inc_empty) + except (FileNotFoundError, ProcessLookupError, FileIOException): + if strict: + raise + + if args.rollup: + stats_print(rollup, rollup_anon, rollup_file, args.inc_empty) + + +def main(): + docs_width = shutil.get_terminal_size().columns + docs_width -= 2 + docs_width = min(80, docs_width) + + def format(string): + text = re.sub(r'\s+', ' ', string) + text = re.sub(r'\s*\\n\s*', '\n', text) + paras = text.split('\n') + paras = [textwrap.fill(p, width=docs_width) for p in paras] + return '\n'.join(paras) + + def formatter(prog): + return argparse.RawDescriptionHelpFormatter(prog, width=docs_width) + + def size2order(human): + units = { + "K": 2**10, "M": 2**20, "G": 2**30, + "k": 2**10, "m": 2**20, "g": 2**30, + } + unit = 1 + if human[-1] in units: + unit = units[human[-1]] + human = human[:-1] + try: + size = int(human) + except ValueError: + raise ArgException('error: --cont value must be integer size with optional KMG unit') + size *= unit + order = int(math.log2(size / PAGE_SIZE)) + if order < 1: + raise ArgException('error: --cont value must be size of at least 2 pages') + if (1 << order) * PAGE_SIZE != size: + raise ArgException('error: --cont value must be size of power-of-2 pages') + if order > PMD_ORDER: + raise ArgException('error: --cont value must be less than or equal to PMD order') + return order + + parser = argparse.ArgumentParser(formatter_class=formatter, + description=format("""Prints information about how transparent huge + pages are mapped, either system-wide, or for a specified + process or cgroup.\\n + \\n + When run with --pid, the user explicitly specifies the set + of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run + with --cgroup, the user passes either a v1 or v2 cgroup and + all pids that belong to the cgroup subtree are scanned. When + run with neither --pid nor --cgroup, the full set of pids on + the system is gathered from /proc and scanned as if the user + had provided "--pid 1 --pid 2 ...".\\n + \\n + A default set of statistics is always generated for THP + mappings. However, it is also possible to generate + additional statistics for "contiguous block mappings" where + the block size is user-defined.\\n + \\n + Statistics are maintained independently for anonymous and + file-backed (pagecache) memory and are shown both in kB and + as a percentage of either total anonymous or total + file-backed memory as appropriate.\\n + \\n + THP Statistics\\n + --------------\\n + \\n + Statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is aligned to + their size, for each supported by the system. + Separate counters describe THPs mapped by PTE vs those + mapped by PMD. (Although note a THP can only be mapped by + PMD if it is PMD-sized):\\n + \\n + - anon-thp-pte-aligned-kB\\n + - file-thp-pte-aligned-kB\\n + - anon-thp-pmd-aligned-kB\\n + - file-thp-pmd-aligned-kB\\n + \\n + Similarly, statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is *not* + aligned to their size, for each supported by the + system. Due to the unaligned mapping, it is impossible to + map by PMD, so there are only PTE counters for this case:\\n + \\n + - anon-thp-pte-unaligned-kB\\n + - file-thp-pte-unaligned-kB\\n + \\n + Statistics are also always generated for mapped pages that + belong to a THP but where the is THP is *not* fully- and + contiguously- mapped. These "partial" mappings are all + counted in the same counter regardless of the size of the + THP that is partially mapped:\\n + \\n + - anon-thp-pte-partial\\n + - file-thp-pte-partial\\n + \\n + Contiguous Block Statistics\\n + ---------------------------\\n + \\n + An optional, additional set of statistics is generated for + every contiguous block size specified with `--cont `. + These statistics show how much memory is mapped in + contiguous blocks of and also aligned to . A + given contiguous block must all belong to the same THP, but + there is no requirement for it to be the *whole* THP. + Separate counters describe contiguous blocks mapped by PTE + vs those mapped by PMD:\\n + \\n + - anon-cont-pte-aligned-kB\\n + - file-cont-pte-aligned-kB\\n + - anon-cont-pmd-aligned-kB\\n + - file-cont-pmd-aligned-kB\\n + \\n + As an example, if monitoring 64K contiguous blocks (--cont + 64K), there are a number of sources that could provide such + blocks: a fully- and contiguously-mapped 64K THP that is + aligned to a 64K boundary would provide 1 block. A fully- + and contiguously-mapped 128K THP that is aligned to at least + a 64K boundary would provide 2 blocks. Or a 128K THP that + maps its first 100K, but contiguously and starting at a 64K + boundary would provide 1 block. A fully- and + contiguously-mapped 2M THP would provide 32 blocks. There + are many other possible permutations.\\n"""), + epilog=format("""Requires root privilege to access pagemap and + kpageflags.""")) + + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--pid', + metavar='pid', required=False, type=int, default=[], action='append', + help="""Process id of the target process. Maybe issued multiple times to + scan multiple processes. --pid and --cgroup are mutually exclusive. + If neither are provided, all processes are scanned to provide + system-wide information.""") + + group.add_argument('--cgroup', + metavar='path', required=False, + help="""Path to the target cgroup in sysfs. Iterates over every pid in + the cgroup and its children. --pid and --cgroup are mutually + exclusive. If neither are provided, all processes are scanned to + provide system-wide information.""") + + parser.add_argument('--rollup', + required=False, default=False, action='store_true', + help="""Sum the per-vma statistics to provide a summary over the whole + system, process or cgroup.""") + + parser.add_argument('--cont', + metavar='size[KMG]', required=False, default=[], action='append', + help="""Adds stats for memory that is mapped in contiguous blocks of + and also aligned to . May be issued multiple times to + track multiple sized blocks. Useful to infer e.g. arm64 contpte and + hpa mappings. Size must be a power-of-2 number of pages.""") + + parser.add_argument('--inc-smaps', + required=False, default=False, action='store_true', + help="""Include all numerical, additive /proc//smaps stats in the + output.""") + + parser.add_argument('--inc-empty', + required=False, default=False, action='store_true', + help="""Show all statistics including those whose value is 0.""") + + parser.add_argument('--periodic', + metavar='sleep_ms', required=False, type=int, + help="""Run in a loop, polling every sleep_ms milliseconds.""") + + args = parser.parse_args() + + try: + args.cont = [size2order(cont) for cont in args.cont] + except ArgException as e: + parser.print_usage() + raise + + if args.periodic: + while True: + do_main(args) + print() + time.sleep(args.periodic / 1000) + else: + do_main(args) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + prog = os.path.basename(sys.argv[0]) + print(f'{prog}: {e}') + exit(1) From 7876e99dd59f1d1257a4349986d9b52e3e088853 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 17 Jan 2024 18:39:54 +0800 Subject: [PATCH 462/707] mm: memory: move mem_cgroup_charge() into alloc_anon_folio() The GFP flags from vma_thp_gfp_mask() according to user configuration only used for large folio allocation but not for memory cgroup charge, and GFP_KERNEL is used for both order-0 and large order folio when memory cgroup charge at present. However, mem_cgroup_charge() uses the GFP flags in a fairly sophisticated way. In addition to checking gfpflags_allow_blocking(), it pays attention to __GFP_NORETRY and __GFP_RETRY_MAYFAIL to ensure that processes within this memcg do not exceed their quotas. So we'd better to move mem_cgroup_charge() into alloc_anon_folio(), 1) it will make us to allocate as much as possible large order folio, because we could try the next order if mem_cgroup_charge() fails, although the memcg's memory usage is close to its limits. 2) using same GFP flags for allocation and charge is to be consistent with PMD THP firstly, in addition, according to GFP flag returned from vma_thp_gfp_mask(), GFP_TRANSHUGE_LIGHT could make us skip direct reclaim, _GFP_NORETRY will make us skip mem_cgroup_oom() and won't trigger memory cgroup oom from large order(order <= COSTLY_ORDER) folio charging. Link: https://lkml.kernel.org/r/20240122011612.501029-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240117103954.2756050-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f5750bfcdc058f..8d14ba44092965 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4153,8 +4153,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4206,15 +4206,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4281,10 +4287,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4338,8 +4340,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } From 3648cefe4682802156f45b64fafbcb7acf947a1c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 18 Jan 2024 18:42:35 +0000 Subject: [PATCH 463/707] mm: writeback: ratelimit stat flush from mem_cgroup_wb_stats One of our workloads (Postgres 14) has regressed when migrated from 5.10 to 6.1 upstream kernel. The regression can be reproduced by sysbench's oltp_write_only benchmark. It seems like the always on rstat flush in mem_cgroup_wb_stats() is causing the regression. So, rate limit that specific rstat flush. One potential consequence would be the dirty throttling might be decided on stale memcg stats. However from our benchmarks and production traffic we have not observed any change in the dirty throttling behavior of the application. Link: https://lkml.kernel.org/r/20240118184235.618164-1-shakeelb@google.com Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat") Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Jan Kara Cc: Jens Axboe Cc: Michal Hocko Cc: Muchun Song Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6cc6b3a2a60c97..391ecdc5af68a0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4801,7 +4801,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(memcg); + mem_cgroup_flush_stats_ratelimited(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); From 12081a7ff341044138ac73f3c14406dbbe0d882a Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 18 Jan 2024 01:50:57 -0800 Subject: [PATCH 464/707] selftests/memfd: delete unused declarations Commit 32d118ad50a5 ("selftests/memfd: add tests for F_SEAL_EXEC"): - added several unused 'nbytes' local variables Commit 6469b66e3f5a ("selftests: improve vm.memfd_noexec sysctl tests"): - orphaned 'newpid_thread_fn2()' forward declaration - orphaned 'join_newpid_thread()' forward declaration - added unused 'pid' local in sysctl_simple_child() - orphaned 'fd' local in sysctl_simple_child() - added unused 'fd' in sysctl_nested_child() Delete the unused locals and forward declarations. Link: https://lkml.kernel.org/r/20240118095057.677544-1-gthelen@google.com Signed-off-by: Greg Thelen Cc: Aleksa Sarai Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 3df00867723910..18f585684e2025 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -44,8 +44,6 @@ */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; -static int newpid_thread_fn2(void *arg); -static void join_newpid_thread(pid_t pid); static ssize_t fd2name(int fd, char *buf, size_t bufsize) { @@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd) static void mfd_assert_has_seals(int fd, unsigned int seals) { char buf[PATH_MAX]; - int nbytes; unsigned int s; fd2name(fd, buf, PATH_MAX); @@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode) static void mfd_assert_chmod(int fd, int mode) { char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void) static int sysctl_simple_child(void *arg) { - int fd; - int pid; - printf("%s sysctl 0\n", memfd_str); test_sysctl_set_sysctl0(); @@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void) static int sysctl_nested_child(void *arg) { - int fd; int pid; printf("%s nested sysctl 0\n", memfd_str); From 2e7e97097d77028ced808783136a95e8f2e337fe Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Wed, 17 Jan 2024 14:39:21 -0800 Subject: [PATCH 465/707] userfaultfd: fix return error if mmap_changing is non-zero in MOVE ioctl To be consistent with other uffd ioctl's returning EAGAIN when mmap_changing is detected, we should change UFFDIO_MOVE to do the same. Link: https://lkml.kernel.org/r/20240117223922.1445327-1-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 959551ff9a9514..05c8e8a054272f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2047,7 +2047,7 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else - ret = -EINVAL; + ret = -EAGAIN; mmap_read_unlock(mm); mmput(mm); From fd09a0ce540dedca104b19735fbf7e2bef9e1101 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 17 Jan 2024 11:00:37 -0700 Subject: [PATCH 466/707] selftests: mm: perform some system cleanup before using hugepages When running with CATEGORY= (thp | hugetlb) we see a large numbers of tests failing. These failures are due to not being able to allocate a hugepage and normally occur on memory contrainted systems or when using large page sizes. drop_cache and compact_memory before the tests for a higher chance at a successful hugepage allocation. Link: https://lkml.kernel.org/r/20240117180037.15734-1-npache@redhat.com Signed-off-by: Nico Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 246d53a5d7f287..040f27e21f47a3 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -206,6 +206,15 @@ pretty_name() { # Usage: run_test [test binary] [arbitrary test arguments...] run_test() { if test_selected ${CATEGORY}; then + # On memory constrainted systems some tests can fail to allocate hugepages. + # perform some cleanup before the test for a higher success rate. + if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then + echo 3 > /proc/sys/vm/drop_caches + sleep 2 + echo 1 > /proc/sys/vm/compact_memory + sleep 2 + fi + local test=$(pretty_name "$*") local title="running $*" local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) From e3ea1b826e160639ba53d117f438798bb0e1538f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 11:20:00 +0100 Subject: [PATCH 467/707] maple_tree: avoid duplicate variable init in mast_spanning_rebalance() The local variables r_tmp and l_tmp in mast_spanning_rebalance() are already initialized at its declaration; there is no need to assign the value again. Remove the duplicate initialization of {r,l}_tmp. No functional change. Due to common compiler optimizations, also no change to object code. This issue was identified with clang-analyzer's dead stores analysis. Link: https://lkml.kernel.org/r/20240122102000.29558-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7b161802860bdb..82fb5195c2354f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2271,8 +2271,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; - r_tmp = *mast->orig_r; - l_tmp = *mast->orig_l; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); From 11c2253562339fef6868973ffcf5c0071a5cd3af Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 10:25:04 +0100 Subject: [PATCH 468/707] mempolicy: clean up minor dead code in queue_pages_test_walk() Commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code") removes MPOL_MF_LAZY handling in queue_pages_test_walk(), and with that, there is no effective use of the local variable endvma in that function remaining. Remove the local variable endvma and its dead code. No functional change. This issue was identified with clang-analyzer's dead stores analysis. Link: https://lkml.kernel.org/r/20240122092504.18377-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c8997..5e519163c4dcb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -654,7 +654,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -682,9 +681,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. From 0b5f17a6861eceef820ccf2b2e7c152d006114c9 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:41 +0800 Subject: [PATCH 469/707] kexec: split crashkernel reservation code out from crash_core.c Patch series "Split crash out from kexec and clean up related config items", v3. Motivation: ============= Previously, LKP reported a building error. When investigating, it can't be resolved reasonablly with the present messy kdump config items. https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/ The kdump (crash dumping) related config items could causes confusions: Firstly, CRASH_CORE enables codes including - crashkernel reservation; - elfcorehdr updating; - vmcoreinfo exporting; - crash hotplug handling; Now fadump of powerpc, kcore dynamic debugging and kdump all selects CRASH_CORE, while fadump - fadump needs crashkernel parsing, vmcoreinfo exporting, and accessing global variable 'elfcorehdr_addr'; - kcore only needs vmcoreinfo exporting; - kdump needs all of the current kernel/crash_core.c. So only enabling PROC_CORE or FA_DUMP will enable CRASH_CORE, this mislead people that we enable crash dumping, actual it's not. Secondly, It's not reasonable to allow KEXEC_CORE select CRASH_CORE. Because KEXEC_CORE enables codes which allocate control pages, copy kexec/kdump segments, and prepare for switching. These codes are shared by both kexec reboot and kdump. We could want kexec reboot, but disable kdump. In that case, CRASH_CORE should not be selected. -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- Thirdly, It's not reasonable to allow CRASH_DUMP select KEXEC_CORE. That could make KEXEC_CORE, CRASH_DUMP are enabled independently from KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE code built in doesn't make any sense because no kernel loading or switching will happen to utilize the KEXEC_CORE code. --------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_CRASH_DUMP=y --------------------- In this case, what is worse, on arch sh and arm, KEXEC relies on MMU, while CRASH_DUMP can still be enabled when !MMU, then compiling error is seen as the lkp test robot reported in above link. ------arch/sh/Kconfig------ config ARCH_SUPPORTS_KEXEC def_bool MMU config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP --------------------------- Changes: =========== 1, split out crash_reserve.c from crash_core.c; 2, split out vmcore_infoc. from crash_core.c; 3, move crash related codes in kexec_core.c into crash_core.c; 4, remove dependency of FA_DUMP on CRASH_DUMP; 5, clean up kdump related config items; 6, wrap up crash codes in crash related ifdefs on all 8 arch-es which support crash dumping, except of ppc; Achievement: =========== With above changes, I can rearrange the config item logic as below (the right item depends on or is selected by the left item): PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE (for kexec reboot only) KEXEC_FILE --| Test ======== On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips, riscv, loongarch, I did below three cases of config item setting and building all passed. Take configs on x86_64 as exampmle here: (1) Both CONFIG_KEXEC and KEXEC_FILE is unset, then all kexec/kdump items are unset automatically: # Kexec and crash features # CONFIG_KEXEC is not set # CONFIG_KEXEC_FILE is not set # end of Kexec and crash features (2) set CONFIG_KEXEC_FILE and 'make olddefconfig': --------------- # Kexec and crash features CONFIG_CRASH_RESERVE=y CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_CRASH_HOTPLUG=y CONFIG_CRASH_MAX_MEMORY_RANGES=8192 # end of Kexec and crash features --------------- (3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig': ------------------------ # Kexec and crash features CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y # end of Kexec and crash features ------------------------ Note: For ppc, it needs investigation to make clear how to split out crash code in arch folder. Hope Hari and Pingfan can help have a look, see if it's doable. Now, I make it either have both kexec and crash enabled, or disable both of them altogether. This patch (of 14): Both kdump and fa_dump of ppc rely on crashkernel reservation. Move the relevant codes into separate files: crash_reserve.c, include/linux/crash_reserve.h. And also add config item CRASH_RESERVE to control its enabling of the codes. And update config items which has relationship with crashkernel reservation. And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE when those scopes are only crashkernel reservation related. And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on arm64, x86 and risc-v because those architectures' crash_core.h is only related to crashkernel reservation. Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/nohash/kaslr_booke.c | 4 +- arch/riscv/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/x86/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 6 +- include/linux/crash_core.h | 40 -- include/linux/crash_reserve.h | 48 ++ include/linux/kexec.h | 1 + kernel/Kconfig.kexec | 5 +- kernel/Makefile | 1 + kernel/crash_core.c | 438 ----------------- kernel/crash_reserve.c | 464 ++++++++++++++++++ 15 files changed, 531 insertions(+), 491 deletions(-) rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%) rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%) rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%) create mode 100644 include/linux/crash_reserve.h create mode 100644 kernel/crash_reserve.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ad5dbaf3dc9f57..d86d7f4758b54f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1520,7 +1520,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config TRANS_TABLE def_bool y diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_reserve.h similarity index 81% rename from arch/arm64/include/asm/crash_core.h rename to arch/arm64/include/asm/crash_reserve.h index 9f5c8d339f44f5..4afe027a4e7b2c 100644 --- a/arch/arm64/include/asm/crash_core.h +++ b/arch/arm64/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ARM64_CRASH_CORE_H -#define _ARM64_CRASH_CORE_H +#ifndef _ARM64_CRASH_RESERVE_H +#define _ARM64_CRASH_RESERVE_H /* Current arm64 boot protocol requires 2MB alignment */ #define CRASH_ALIGN SZ_2M diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d281..7f704ae5c5efcb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -691,6 +691,7 @@ config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE + select CRASH_RESERVE select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index b4f2786a7d2b0b..cdff129abb1446 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -173,7 +173,7 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a06828..bd06ad1bb97cbb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -767,7 +767,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config COMPAT bool "Kernel support for 32-bit U-mode" diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_reserve.h similarity index 78% rename from arch/riscv/include/asm/crash_core.h rename to arch/riscv/include/asm/crash_reserve.h index e1874b23feaf11..013962e63587f3 100644 --- a/arch/riscv/include/asm/crash_core.h +++ b/arch/riscv/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _RISCV_CRASH_CORE_H -#define _RISCV_CRASH_CORE_H +#ifndef _RISCV_CRASH_RESERVE_H +#define _RISCV_CRASH_RESERVE_H #define CRASH_ALIGN PMD_SIZE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bfc9..71417c5b228c51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESEERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_reserve.h similarity index 92% rename from arch/x86/include/asm/crash_core.h rename to arch/x86/include/asm/crash_reserve.h index 76af98f4e80126..152239f9554195 100644 --- a/arch/x86/include/asm/crash_core.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _X86_CRASH_CORE_H -#define _X86_CRASH_CORE_H +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -39,4 +39,4 @@ static inline unsigned long crash_low_size_default(void) #endif } -#endif /* _X86_CRASH_CORE_H */ +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad3a..1fde49246fa6e3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,14 +5,6 @@ #include #include #include -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include -#endif - -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) @@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M -#endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G -#endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() -#endif - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} -#endif - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 00000000000000..5a9df944fb806a --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include +#include +#include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176e0..6d79bfb52e5bf0 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 946dffa048b74c..8b7be71edd859e 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -2,11 +2,15 @@ menu "Kexec and crash features" +config CRASH_RESERVE + bool + config CRASH_CORE bool config KEXEC_CORE select CRASH_CORE + select CRASH_RESERVE bool config KEXEC_ELF @@ -96,7 +100,6 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - select CRASH_CORE select KEXEC_CORE help Generate crash dump after being started by kexec. diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fcfa..05fa88b3ab7499 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 49b31e59d3ccd1..ae0d1ce89b46b8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -34,444 +34,6 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - unsigned long long total_mem = system_ram; - - /* - * Firmware sometimes reserves some memory regions for its own use, - * so the system memory size is less than the actual physical memory - * size. Work around this by rounding up the total size to 128M, - * which is enough for most test cases. - */ - total_mem = roundup(total_mem, SZ_128M); - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= total_mem) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (total_mem >= start && total_mem < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } else - pr_info("crashkernel size resulted in zero bytes\n"); - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -ENOENT; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - * - * If crashkernel=,high|low is supported on architecture, non-NULL values - * should be passed to parameters 'low_size' and 'high'. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - unsigned long long *low_size, - bool *high) -{ - int ret; - - /* crashkernel=X[@offset] */ - ret = __parse_crashkernel(cmdline, system_ram, crash_size, - crash_base, NULL); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - /* - * If non-NULL 'high' passed in and no normal crashkernel - * setting detected, try parsing crashkernel=,high|low. - */ - if (high && ret == -ENOENT) { - ret = __parse_crashkernel(cmdline, 0, crash_size, - crash_base, suffix_tbl[SUFFIX_HIGH]); - if (ret || !*crash_size) - return -EINVAL; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. - */ - ret = __parse_crashkernel(cmdline, 0, low_size, - crash_base, suffix_tbl[SUFFIX_LOW]); - if (ret == -ENOENT) { - *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - ret = 0; - } else if (ret) { - return ret; - } - - *high = true; - } -#endif - if (!*crash_size) - ret = -EINVAL; - - return ret; -} - -/* - * Add a dummy early_param handler to mark crashkernel= as a known command line - * parameter and suppress incorrect warnings in init/main.c. - */ -static int __init parse_crashkernel_dummy(char *arg) -{ - return 0; -} -early_param("crashkernel", parse_crashkernel_dummy); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -static int __init reserve_crashkernel_low(unsigned long long low_size) -{ -#ifdef CONFIG_64BIT - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; -#endif - return 0; -} - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{ - unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; - bool fixed_base = false; - - /* User specifies base address explicitly. */ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - search_end = crash_base + crash_size; - } else if (high) { - search_base = CRASH_ADDR_LOW_MAX; - search_end = CRASH_ADDR_HIGH_MAX; - } - -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, search_end); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } - - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && search_end == CRASH_ADDR_LOW_MAX) { - search_end = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } - - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - if (high && search_end == CRASH_ADDR_HIGH_MAX) { - search_end = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && - crash_low_size && reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. - */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; -} - -static __init int insert_crashkernel_resources(void) -{ - if (crashk_res.start < crashk_res.end) - insert_resource(&iomem_resource, &crashk_res); - - if (crashk_low_res.start < crashk_low_res.end) - insert_resource(&iomem_resource, &crashk_low_res); - - return 0; -} -early_initcall(insert_crashkernel_resources); -#endif - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c new file mode 100644 index 00000000000000..bbb6c3cb00e460 --- /dev/null +++ b/kernel/crash_reserve.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= total_mem) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (total_mem >= start && total_mem < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } else + pr_info("crashkernel size resulted in zero bytes\n"); + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + if (!ck_cmdline) + return -ENOENT; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) +{ + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; +} + +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif From b2044843b278023d73c328cdbf9c906e224a8003 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 28 Jan 2024 22:00:15 -0800 Subject: [PATCH 470/707] kexec-split-crashkernel-reservation-code-out-from-crash_corec-fix s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin Cc: Al Viro Cc: Baoquan He Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 71417c5b228c51..5bd9258151546e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_RESEERVE + def_bool CRASH_RESERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) From 6516ae2baeb5c04b3edd13020383df8c1a478095 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:42 +0800 Subject: [PATCH 471/707] crash: split vmcoreinfo exporting code out from crash_core.c Now move the relevant codes into separate files: kernel/crash_reserve.c, include/linux/crash_reserve.h. And add config item CRASH_RESERVE to control its enabling. And also update the old ifdeffery of CONFIG_CRASH_CORE, including of and config item dependency on CRASH_CORE accordingly. And also do renaming as follows: - arch/xxx/kernel/{crash_core.c => vmcore_info.c} because they are only related to vmcoreinfo exporting on x86, arm64, riscv. And also Remove config item CRASH_CORE, and rely on CONFIG_KEXEC_CORE to decide if build in crash_core.c. Link: https://lkml.kernel.org/r/20240124051254.67105-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/arm64/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/platforms/powernv/opal-core.c | 2 +- arch/riscv/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/x86/kernel/Makefile | 2 +- .../{crash_core_32.c => vmcore_info_32.c} | 2 +- .../{crash_core_64.c => vmcore_info_64.c} | 2 +- drivers/firmware/qemu_fw_cfg.c | 14 +- fs/proc/Kconfig | 2 +- fs/proc/kcore.c | 2 +- include/linux/buildid.h | 2 +- include/linux/crash_core.h | 73 ------ include/linux/kexec.h | 1 + include/linux/vmcore_info.h | 81 ++++++ kernel/Kconfig.kexec | 4 +- kernel/Makefile | 4 +- kernel/crash_core.c | 206 ---------------- kernel/ksysfs.c | 6 +- kernel/printk/printk.c | 4 +- kernel/vmcore_info.c | 231 ++++++++++++++++++ lib/buildid.c | 2 +- 24 files changed, 343 insertions(+), 309 deletions(-) rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (97%) rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (96%) rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%) rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%) create mode 100644 include/linux/vmcore_info.h create mode 100644 kernel/vmcore_info.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index e5d03a7039b4bf..9fd638b91f38d0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -66,7 +66,7 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c similarity index 97% rename from arch/arm64/kernel/crash_core.c rename to arch/arm64/kernel/vmcore_info.c index 2a24199a9b81e0..b19d5d6cb8b387 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include +#include #include #include #include diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7f704ae5c5efcb..495d197c9b2751 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -690,7 +690,7 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE select CRASH_DUMP help diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9b142b9d5187b2..733f210ffda1fe 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -109,7 +109,7 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; #endif diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index bb7657115f1d27..c9a9b759cc928b 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053d8..d6fd8dcfceb5e3 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -92,7 +92,7 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/vmcore_info.c similarity index 96% rename from arch/riscv/kernel/crash_core.c rename to arch/riscv/kernel/vmcore_info.c index d18d529fd9b984..6d7a22522d6309 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/vmcore_info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f4d..913d4022131eba 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -98,7 +98,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o -obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c similarity index 90% rename from arch/x86/kernel/crash_core_32.c rename to arch/x86/kernel/vmcore_info_32.c index 8a89c109e20a6c..5995a749288a95 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/vmcore_info_32.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c similarity index 94% rename from arch/x86/kernel/crash_core_64.c rename to arch/x86/kernel/vmcore_info_64.c index 7d255f882afe6f..0dec7d86875447 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/vmcore_info_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index 03da9a4354f886..5f43dfa22f799c 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include MODULE_AUTHOR("Gabriel L. Somlo "); MODULE_DESCRIPTION("QEMU fw_cfg sysfs support"); @@ -67,7 +67,7 @@ static void fw_cfg_sel_endianness(u16 key) iowrite16(key, fw_cfg_reg_ctrl); } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static inline bool fw_cfg_dma_enabled(void) { return (fw_cfg_rev & FW_CFG_VERSION_DMA) && fw_cfg_reg_dma; @@ -156,7 +156,7 @@ static ssize_t fw_cfg_read_blob(u16 key, return count; } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* write chunk of given fw_cfg blob (caller responsible for sanity-check) */ static ssize_t fw_cfg_write_blob(u16 key, void *buf, loff_t pos, size_t count) @@ -195,7 +195,7 @@ static ssize_t fw_cfg_write_blob(u16 key, return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* clean up fw_cfg device i/o */ static void fw_cfg_io_cleanup(void) @@ -319,7 +319,7 @@ struct fw_cfg_sysfs_entry { struct list_head list; }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) { static struct fw_cfg_vmcoreinfo *data; @@ -343,7 +343,7 @@ static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) kfree(data); return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* get fw_cfg_sysfs_entry from kobject member */ static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj) @@ -583,7 +583,7 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f) int err; struct fw_cfg_sysfs_entry *entry; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO if (fw_cfg_dma_enabled() && strcmp(f->name, FW_CFG_VMCOREINFO_FILENAME) == 0 && !is_kdump_kernel()) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137c6..d80a1431ef7be0 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -32,7 +32,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU - select CRASH_CORE + select VMCORE_INFO help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6422e569b08085..8e08a9a1b7ed57 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,7 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ -#include +#include #include #include #include diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 8a582d242f0672..20aa3c2d89f760 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; void init_vmlinux_build_id(void); #else diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 1fde49246fa6e3..7f19f62018ef9c 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,79 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" -#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) -#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) - -/* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. - */ -#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - CRASH_CORE_NOTE_NAME_BYTES + \ - CRASH_CORE_NOTE_DESC_BYTES) - -#define VMCOREINFO_BYTES PAGE_SIZE -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - VMCOREINFO_NOTE_NAME_BYTES + \ - VMCOREINFO_BYTES) - -typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; -/* Per cpu memory for storing cpu states in case of system crash. */ -extern note_buf_t __percpu *crash_notes; - -void crash_update_vmcoreinfo_safecopy(void *ptr); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID() \ - ({ \ - static_assert(sizeof(vmlinux_build_id) == 20); \ - vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ - }) - -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ARRAY(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_TYPE_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) - -extern unsigned char *vmcoreinfo_data; -extern size_t vmcoreinfo_size; -extern u32 *vmcoreinfo_note; - -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len); -void final_note(Elf_Word *buf); - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d79bfb52e5bf0..9c7bb8b56ed66d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include #include diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h new file mode 100644 index 00000000000000..e1dec1a6a749dc --- /dev/null +++ b/include/linux/vmcore_info.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VMCORE_INFO_H +#define LINUX_VMCORE_INFO_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +/* + * The per-cpu notes area is a list of notes terminated by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. + */ +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES PAGE_SIZE +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +/* Per cpu memory for storing cpu states in case of system crash. */ +extern note_buf_t __percpu *crash_notes; + +void crash_update_vmcoreinfo_safecopy(void *ptr); +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; +extern u32 *vmcoreinfo_note; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); +#endif /* LINUX_VMCORE_INFO_H */ diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8b7be71edd859e..8faf27043432fe 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -5,11 +5,11 @@ menu "Kexec and crash features" config CRASH_RESERVE bool -config CRASH_CORE +config VMCORE_INFO bool config KEXEC_CORE - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE bool diff --git a/kernel/Makefile b/kernel/Makefile index 05fa88b3ab7499..649272a1d6b9f8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,9 +68,9 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index ae0d1ce89b46b8..2f4df1fe6f7af5 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -26,14 +26,6 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; -u32 *vmcoreinfo_note; - -/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -static unsigned char *vmcoreinfo_data_safecopy; - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -195,204 +187,6 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); - memcpy(buf, name, note->n_namesz); - buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); - memcpy(buf, data, data_len); - buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); - - return buf; -} - -void final_note(Elf_Word *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_update_vmcoreinfo_safecopy(void *ptr) -{ - if (ptr) - memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); - - vmcoreinfo_data_safecopy = ptr; -} - -void crash_save_vmcoreinfo(void) -{ - if (!vmcoreinfo_note) - return; - - /* Use the safe copy to generate vmcoreinfo note if have */ - if (vmcoreinfo_data_safecopy) - vmcoreinfo_data = vmcoreinfo_data_safecopy; - - vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; - - WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, - "vmcoreinfo data exceeds allocated size, truncating"); -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa(vmcoreinfo_note); -} -EXPORT_SYMBOL(paddr_vmcoreinfo_note); - -static int __init crash_save_vmcoreinfo_init(void) -{ - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); - if (!vmcoreinfo_data) { - pr_warn("Memory allocation for vmcoreinfo_data failed\n"); - return -ENOMEM; - } - - vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, - GFP_KERNEL | __GFP_ZERO); - if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); - vmcoreinfo_data = NULL; - pr_warn("Memory allocation for vmcoreinfo_note failed\n"); - return -ENOMEM; - } - - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_BUILD_ID(); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_OFFSET(uts_namespace, name); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); - -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL_ARRAY(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); - VMCOREINFO_NUMBER(SECTION_SIZE_BITS); - VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLATMEM - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); - log_buf_vmcoreinfo_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) - VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif - -#ifdef CONFIG_KALLSYMS - VMCOREINFO_SYMBOL(kallsyms_names); - VMCOREINFO_SYMBOL(kallsyms_num_syms); - VMCOREINFO_SYMBOL(kallsyms_token_table); - VMCOREINFO_SYMBOL(kallsyms_token_index); -#ifdef CONFIG_KALLSYMS_BASE_RELATIVE - VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); -#else - VMCOREINFO_SYMBOL(kallsyms_addresses); -#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ -#endif /* CONFIG_KALLSYMS */ - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1d4bc493b2f4b2..11526fc42bc24c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -154,7 +154,7 @@ KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,7 +177,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -265,7 +265,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f2444b581e16c3..7d74b000b43a9b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -951,7 +951,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c new file mode 100644 index 00000000000000..8f48c0a42e2eed --- /dev/null +++ b/kernel/vmcore_info.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* vmcoreinfo stuff */ +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + +void crash_save_vmcoreinfo(void) +{ + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa(vmcoreinfo_note); +} +EXPORT_SYMBOL(paddr_vmcoreinfo_note); + +static int __init crash_save_vmcoreinfo_init(void) +{ + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_BUILD_ID(); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_OFFSET(uts_namespace, name); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); + +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL_ARRAY(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLATMEM + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#endif + +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/lib/buildid.c b/lib/buildid.c index e3a7acdeef0ed4..3e6868c86b45a8 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -174,7 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) return parse_build_id_buf(build_id, NULL, buf, buf_size); } -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init; /** From 593b34d52d3b9dd4364eefcde8f52b9f1a915792 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 26 Jan 2024 08:57:44 +0800 Subject: [PATCH 472/707] crash: remove duplicated include in vmcore_info.c The header files kexec.h is included twice in vmcore_info.c, so one inclusion can be removed. Link: https://lkml.kernel.org/r/20240126005744.16561-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8f48c0a42e2eed..8f77e238a54f54 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include From 0a7afef35691c2c1a4dd5e7609f7de67d41713e7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:43 +0800 Subject: [PATCH 473/707] crash: remove dependency of FA_DUMP on CRASH_DUMP In kdump kernel, /proc/vmcore is an elf file mapping the crashed kernel's old memory content. Its elf header is constructed in 1st kernel and passed to kdump kernel via elfcorehdr_addr. Config CRASH_DUMP enables the code of 1st kernel's old memory accessing in different architectures. Currently, config FA_DUMP has dependency on CRASH_DUMP because fadump needs access global variable 'elfcorehdr_addr' to judge if it's in kdump kernel within function is_kdump_kernel(). In the current kernel/crash_dump.c, variable 'elfcorehdr_addr' is defined, and function setup_elfcorehdr() used to parse kernel parameter to fetch the passed value of elfcorehdr_addr. Only for accessing elfcorehdr_addr, FA_DUMP really doesn't have to depends on CRASH_DUMP. To remove the dependency of FA_DUMP on CRASH_DUMP to avoid confusion, rename kernel/crash_dump.c to kernel/elfcorehdr.c, and build it when CONFIG_VMCORE_INFO is ebabled. With this, FA_DUMP doesn't need to depend on CRASH_DUMP. Link: https://lkml.kernel.org/r/20240124051254.67105-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 - kernel/Makefile | 3 +-- kernel/{crash_dump.c => elfcorehdr.c} | 0 kernel/kexec_internal.h | 2 ++ 4 files changed, 3 insertions(+), 3 deletions(-) rename kernel/{crash_dump.c => elfcorehdr.c} (100%) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 495d197c9b2751..e66fd9923250ea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -692,7 +692,6 @@ config FA_DUMP depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select VMCORE_INFO select CRASH_RESERVE - select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. This approach does not use kexec, diff --git a/kernel/Makefile b/kernel/Makefile index 649272a1d6b9f8..35abc65e1f1ade 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o obj-$(CONFIG_KEXEC) += kexec.o @@ -121,7 +121,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c similarity index 100% rename from kernel/crash_dump.c rename to kernel/elfcorehdr.c diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 74da1409cd14b5..2595defe8c0d92 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -4,6 +4,8 @@ #include +struct kexec_segment; + struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); From a0537ef1e32e78f1e5c1d8979bc8a95410f32747 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:44 +0800 Subject: [PATCH 474/707] crash: split crash dumping code out from kexec_core.c Currently, KEXEC_CORE select CRASH_CORE automatically because crash codes need be built in to avoid compiling error when building kexec code even though the crash dumping functionality is not enabled. E.g -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- After splitting out crashkernel reservation code and vmcoreinfo exporting code, there's only crash related code left in kernel/crash_core.c. Now move crash related codes from kexec_core.c to crash_core.c and only build it in when CONFIG_CRASH_DUMP=y. And also wrap up crash codes inside CONFIG_CRASH_DUMP ifdeffery scope, or replace inappropriate CONFIG_KEXEC_CORE ifdef with CONFIG_CRASH_DUMP ifdef in generic kernel files. With these changes, crash_core codes are abstracted from kexec codes and can be disabled at all if only kexec reboot feature is wanted. Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- drivers/base/cpu.c | 6 +- include/linux/crash_core.h | 61 +++++++++ include/linux/kexec.h | 45 +------ init/initramfs.c | 2 +- kernel/Makefile | 3 +- kernel/crash_core.c | 256 +++++++++++++++++++++++++++++++++++++ kernel/kexec.c | 11 +- kernel/kexec_core.c | 250 ++---------------------------------- kernel/kexec_file.c | 13 +- kernel/ksysfs.c | 4 + 10 files changed, 359 insertions(+), 292 deletions(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba65e..b621a0fc75e15a 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -144,7 +144,7 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store); #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ #endif /* CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP #include static ssize_t crash_notes_show(struct device *dev, @@ -189,14 +189,14 @@ static const struct attribute_group crash_note_cpu_attr_group = { #endif static const struct attribute_group *common_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL }; static const struct attribute_group *hotplugable_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 7f19f62018ef9c..23270b16e1dbf3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,6 +6,48 @@ #include #include +struct kimage; + +#ifdef CONFIG_CRASH_DUMP + +int crash_shrink_memory(unsigned long new_size); +ssize_t crash_get_memory_size(void); + +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +#endif + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } +#endif + +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -31,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9c7bb8b56ed66d..060835bb82d52f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,6 @@ #if !defined(__ASSEMBLY__) -#include #include #include #include @@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include #include #include +#include /* Verify architecture specific macros are defined */ @@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb16..6f095f54eec976 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -642,7 +642,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) "initrd"); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); diff --git a/kernel/Makefile b/kernel/Makefile index 35abc65e1f1ade..3c13240dfc9f09 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,8 @@ obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2f4df1fe6f7af5..78b5dc7cee3ab7 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,9 +11,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include @@ -26,6 +31,131 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP + +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) + return 0; + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + + + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in make_task_dead() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __noclone __crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (kexec_trylock()) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + kexec_unlock(); + } +} +STACK_FRAME_NON_STANDARD(__crash_kexec); + +__bpf_kfunc void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ? 0 : resource_size(res); +} + + + + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -187,6 +317,130 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +ssize_t crash_get_memory_size(void) +{ + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; + + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); + + kexec_unlock(); + return size; +} + +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long old_size, low_size; + + if (!kexec_trylock()) + return -EBUSY; + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } + + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } + +unlock: + kexec_unlock(); + return ret; +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.common.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + + + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ @@ -220,6 +474,8 @@ static int __init crash_notes_memory_init(void) } subsys_initcall(crash_notes_memory_init); +#endif /*CONFIG_CRASH_DUMP*/ + #ifdef CONFIG_CRASH_HOTPLUG #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt diff --git a/kernel/kexec.c b/kernel/kexec.c index 8f35a5a42af852..bab542fc1463d2 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } +#endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = sanity_check_segment_list(image); if (ret) @@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (nr_segments == 0) { /* Uninstall image */ @@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kimage_free(image); out_unlock: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db9790..ce3429e7972ccd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -54,30 +54,6 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in make_task_dead() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -int kexec_crash_loaded(void) -{ - return !!kexec_crash_image; -} -EXPORT_SYMBOL_GPL(kexec_crash_loaded); - /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image) if (total_pages > nr_pages / 2) return -EINVAL; +#ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't @@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image) return -EADDRNOTAVAIL; } } +#endif return 0; } @@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; } +#ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; } +#endif struct page *kimage_alloc_control_pages(struct kimage *image, @@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image, case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; +#endif } return pages; } -int kimage_crash_copy_vmcoreinfo(struct kimage *image) -{ - struct page *vmcoreinfo_page; - void *safecopy; - - if (image->type != KEXEC_TYPE_CRASH) - return 0; - - /* - * For kdump, allocate one vmcoreinfo safe copy from the - * crash memory. as we have arch_kexec_protect_crashkres() - * after kexec syscall, we naturally protect it from write - * (even read) access under kernel direct mapping. But on - * the other hand, we still need to operate it when crash - * happens to generate vmcoreinfo note, hereby we rely on - * vmap for this purpose. - */ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { - pr_warn("Could not allocate vmcoreinfo buffer\n"); - return -ENOMEM; - } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); - if (!safecopy) { - pr_warn("Could not vmap vmcoreinfo buffer\n"); - return -ENOMEM; - } - - image->vmcoreinfo_data_copy = safecopy; - crash_update_vmcoreinfo_safecopy(safecopy); - - return 0; -} - static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -603,10 +551,12 @@ void kimage_free(struct kimage *image) if (!image) return; +#ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } +#endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { @@ -824,6 +774,7 @@ static int kimage_load_normal_segment(struct kimage *image, return result; } +#ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -891,6 +842,7 @@ static int kimage_load_crash_segment(struct kimage *image, out: return result; } +#endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) @@ -901,9 +853,11 @@ int kimage_load_segment(struct kimage *image, case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; +#endif } return result; @@ -1027,186 +981,6 @@ bool kexec_load_permitted(int kexec_image_type) return true; } -/* - * No panic_cpu check version of crash_kexec(). This function is called - * only when panic_cpu holds the current CPU number; this is the only CPU - * which processes crash_kexec routines. - */ -void __noclone __crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_lock here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (kexec_trylock()) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - kexec_unlock(); - } -} -STACK_FRAME_NON_STANDARD(__crash_kexec); - -__bpf_kfunc void crash_kexec(struct pt_regs *regs) -{ - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { - /* This is the 1st CPU which comes here, so go ahead. */ - __crash_kexec(regs); - - /* - * Reset panic_cpu to allow another panic()/crash_kexec() - * call. - */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); - } -} - -static inline resource_size_t crash_resource_size(const struct resource *res) -{ - return !res->end ? 0 : resource_size(res); -} - -ssize_t crash_get_memory_size(void) -{ - ssize_t size = 0; - - if (!kexec_trylock()) - return -EBUSY; - - size += crash_resource_size(&crashk_res); - size += crash_resource_size(&crashk_low_res); - - kexec_unlock(); - return size; -} - -static int __crash_shrink_memory(struct resource *old_res, - unsigned long new_size) -{ - struct resource *ram_res; - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) - return -ENOMEM; - - ram_res->start = old_res->start + new_size; - ram_res->end = old_res->end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(old_res); - old_res->start = 0; - old_res->end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); - - return 0; -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long old_size, low_size; - - if (!kexec_trylock()) - return -EBUSY; - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - - low_size = crash_resource_size(&crashk_low_res); - old_size = crash_resource_size(&crashk_res) + low_size; - new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - /* - * (low_size > new_size) implies that low_size is greater than zero. - * This also means that if low_size is zero, the else branch is taken. - * - * If low_size is greater than 0, (low_size > new_size) indicates that - * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res - * needs to be shrunken. - */ - if (low_size > new_size) { - ret = __crash_shrink_memory(&crashk_res, 0); - if (ret) - goto unlock; - - ret = __crash_shrink_memory(&crashk_low_res, new_size); - } else { - ret = __crash_shrink_memory(&crashk_res, new_size - low_size); - } - - /* Swap crashk_res and crashk_low_res if needed */ - if (!crashk_res.end && crashk_low_res.end) { - crashk_res.start = crashk_low_res.start; - crashk_res.end = crashk_low_res.end; - release_resource(&crashk_low_res); - crashk_low_res.start = 0; - crashk_low_res.end = 0; - insert_resource(&iomem_resource, &crashk_res); - } - -unlock: - kexec_unlock(); - return ret; -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.common.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bef2f6f2571b42..ce7ce2ae27cdfe 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kexec_unlock(); kimage_free(image); @@ -595,12 +600,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, static int kexec_walk_resources(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); - else if (kbuf->top_down) +#endif + if (kbuf->top_down) return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 11526fc42bc24c..fe7a517fc4abbf 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_loaded); +#ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -152,6 +153,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO @@ -262,9 +264,11 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif +#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG From 945e488db1666b3a765c7ad65dee061a3d2099e1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:45 +0800 Subject: [PATCH 475/707] crash: clean up kdump related config items By splitting CRASH_RESERVE and VMCORE_INFO out from CRASH_CORE, cleaning up the dependency of FA_DMUMP on CRASH_DUMP, and moving crash codes from kexec_core.c to crash_core.c, now we can rearrange CRASH_DUMP to depend on KEXEC_CORE, and make CRASH_DUMP select CRASH_RESERVE and VMCORE_INFO. KEXEC_CORE won't select CRASH_RESERVE and VMCORE_INFO any more because KEXEC_CORE enables codes which allocate control pages, copy kexec/kdump segments, and prepare for switching. These codes are shared by both kexec reboot and crash dumping. Doing this makes codes and the corresponding config items more logical (the right item depends on or is selected by the left item). PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE--> kexec reboot KEXEC_FILE --| Link: https://lkml.kernel.org/r/20240124051254.67105-6-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8faf27043432fe..6c34e63c88ff4c 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -9,8 +9,6 @@ config VMCORE_INFO bool config KEXEC_CORE - select VMCORE_INFO - select CRASH_RESERVE bool config KEXEC_ELF @@ -99,8 +97,11 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" + default y depends on ARCH_SUPPORTS_CRASH_DUMP - select KEXEC_CORE + depends on KEXEC_CORE + select VMCORE_INFO + select CRASH_RESERVE help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels From 0d59e53ed6c3b33199d7a9f916278806e8b06661 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:46 +0800 Subject: [PATCH 476/707] x86, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on x86 with some adjustments. Here, also change some ifdefs or IS_ENABLED() check to more appropriate ones, e,g - #ifdef CONFIG_KEXEC_CORE -> #ifdef CONFIG_CRASH_DUMP - (!IS_ENABLED(CONFIG_KEXEC_CORE)) - > (!IS_ENABLED(CONFIG_CRASH_RESERVE)) Link: https://lkml.kernel.org/r/20240124051254.67105-7-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/cpu/mshyperv.c | 4 ++++ arch/x86/kernel/kexec-bzimage64.c | 4 ++++ arch/x86/kernel/kvm.c | 4 ++-- arch/x86/kernel/machine_kexec_64.c | 3 +++ arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/xen/enlighten_hvm.c | 4 ++++ 9 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 913d4022131eba..3668b1edef2d28 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,9 +100,9 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_X86_32) += doublefault_32.o diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b66c..f8163a59026ba5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -210,6 +210,7 @@ static void hv_machine_shutdown(void) hyperv_cleanup(); } +#ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) @@ -221,6 +222,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU. */ hyperv_cleanup(); } +#endif #endif /* CONFIG_KEXEC_CORE */ #endif /* CONFIG_HYPERV */ @@ -497,7 +499,9 @@ static void __init ms_hyperv_init_platform(void) #if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif #endif if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2a422e00ed4b42..b55737b83a841a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) return ret; } else +#endif setup_e820_entries(params); nr_e820_entries = params->e820_entries; @@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } +#ifdef CONFIG_CRASH_DUMP /* Allocate and load backup region */ if (image->type == KEXEC_TYPE_CRASH) { ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); } +#endif /* * Load purgatory. For 64bit entry point, purgatory code can be diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dfe9945b9becee..acfc2d3183bce6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -769,7 +769,7 @@ static struct notifier_block kvm_pv_reboot_nb = { * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); @@ -852,7 +852,7 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index bc0a5348b4a627..b180d8e497c317 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -508,6 +508,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif /* CONFIG_KEXEC_FILE */ +#ifdef CONFIG_CRASH_DUMP + static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { @@ -552,6 +554,7 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } +#endif /* * During a traditional boot under SME, SME will encrypt the kernel, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 830425e6d38e2f..1287b0d5962f7f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP .crash_shutdown = native_machine_crash_shutdown, #endif }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84201071dfacd1..899d839a2954a7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -471,7 +471,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96a771f9f930a6..52c3823b721191 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -282,7 +282,7 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, -#if defined(CONFIG_KEXEC_CORE) +#if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 3f8c34707c5001..09e3db7ff99066 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -149,12 +149,14 @@ static void xen_hvm_shutdown(void) xen_reboot(SHUTDOWN_soft_reset); } +#ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) { native_machine_crash_shutdown(regs); xen_reboot(SHUTDOWN_soft_reset); } #endif +#endif static int xen_cpu_up_prepare_hvm(unsigned int cpu) { @@ -236,8 +238,10 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif +#endif } static __init int xen_parse_nopv(char *arg) From 674314edd59dba87224bf477afd901abf3c6d441 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:47 +0800 Subject: [PATCH 477/707] arm64, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm64 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. Link: https://lkml.kernel.org/r/20240124051254.67105-8-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/arm64/include/asm/kexec.h | 2 +- arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/machine_kexec_file.c | 10 ++++++++-- arch/arm64/mm/init.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9ac9572a3bbee2..4d9cc7a76d9ca1 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d07..82e2203d86a31f 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba64..af1ca875c52ce2 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 74c1db8ce271d8..c1f6213e77f328 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From ed4ab93b8dca4d638afd18dab2a1d3d930f1c882 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:48 +0800 Subject: [PATCH 478/707] ppc, crash: enforce KEXEC and KEXEC_FILE to select CRASH_DUMP In PowerPC, the crash dumping and kexec reboot share code in arch_kexec_locate_mem_hole(), in which struct crash_mem is used. Here enfoce enforce KEXEC and KEXEC_FILE to select CRASH_DUMP for now. Link: https://lkml.kernel.org/r/20240124051254.67105-9-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e66fd9923250ea..31f013e636e397 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -608,6 +608,10 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) +config ARCH_SELECTS_KEXEC + def_bool y + select CRASH_DUMP + config ARCH_SUPPORTS_KEXEC_FILE def_bool PPC64 @@ -618,6 +622,7 @@ config ARCH_SELECTS_KEXEC_FILE def_bool y depends on KEXEC_FILE select KEXEC_ELF + select CRASH_DUMP select HAVE_IMA_KEXEC if IMA config PPC64_BIG_ENDIAN_ELF_ABI_V2 From 34847c970d7feb3460c577e2bc9b5d54dca8c53b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 25 Jan 2024 22:29:07 +0800 Subject: [PATCH 479/707] ppc-crash-enforce-kexec-and-kexec_file-to-select-crash_dump-fix I reproduced the failure with allnoconfig on ppc, and found below change can fix it too. And the change makes ARCH_SELECTS_KEXEC consistent with ARCH_SELECTS_KEXEC_FILE on the dependency. What do you think? Link: https://lkml.kernel.org/r/ZbJwMyCpz4HDySoo@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reported-by: Stephen Rothwell Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 31f013e636e397..79f98cd5f2c907 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -610,6 +610,7 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SELECTS_KEXEC def_bool y + depends on KEXEC select CRASH_DUMP config ARCH_SUPPORTS_KEXEC_FILE From 1819823eed348ec15d25f82ff2afeeaf73d15430 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:49 +0800 Subject: [PATCH 480/707] s390, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on s390 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. Link: https://lkml.kernel.org/r/20240124051254.67105-10-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 ++ arch/s390/kernel/kexec_image.c | 2 ++ arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9da6fa30c44749..4d364de4379921 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = phdr->p_memsz; data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index af23eff5774dba..a32ce8bea745cf 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.bufsz = image->kernel_buf_len; buf.mem = 0; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8d207b82d9fedd..c2bac14dd668ae 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -105,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image, if (ret) return ret; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { u64 crash_size; @@ -121,6 +122,7 @@ static int kexec_file_update_purgatory(struct kimage *image, sizeof(crash_size), false); } +#endif return ret; } @@ -134,8 +136,10 @@ static int kexec_file_add_purgatory(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_load_purgatory(image, &buf); if (ret) @@ -158,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->parm->initrd_start = data->memsz; @@ -223,8 +229,10 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_add_buffer(&buf); out: @@ -268,10 +276,12 @@ void *kexec_file_add_components(struct kimage *image, memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { data.parm->oldmem_base = crashk_res.start; data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } +#endif if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); From b4a07e34e15bf9b796a270932902f671cf0b4507 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:50 +0800 Subject: [PATCH 481/707] sh, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on SuperH with some adjustments. Wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/sh/kernel/machine_kexec.c | 3 +++ arch/sh/kernel/setup.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index fa3a7b36190a2a..8daa8a6e6fa683 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -153,6 +153,9 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, NULL, NULL); if (ret == 0 && crash_size > 0) { diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index d3175f09b3aad9..620e5cf8ae1e74 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -220,7 +220,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, request_resource(res, &code_resource); request_resource(res, &data_resource); request_resource(res, &bss_resource); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE request_resource(res, &crashk_res); #endif From cda487c0a74a9160c1477e14d857b339c79732c5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:51 +0800 Subject: [PATCH 482/707] mips, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on mips with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-12-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/mips/kernel/setup.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 9c30de15159761..12a1a4ffb60211 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -442,8 +442,6 @@ static void __init mips_reserve_vmcore(void) #endif } -#ifdef CONFIG_KEXEC - /* 64M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_64M #define CRASH_ADDR_MAX SZ_512M @@ -454,6 +452,9 @@ static void __init mips_parse_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, @@ -489,6 +490,9 @@ static void __init request_crashkernel(struct resource *res) { int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + if (crashk_res.start == crashk_res.end) return; @@ -498,15 +502,6 @@ static void __init request_crashkernel(struct resource *res) (unsigned long)(resource_size(&crashk_res) >> 20), (unsigned long)(crashk_res.start >> 20)); } -#else /* !defined(CONFIG_KEXEC) */ -static void __init mips_parse_crashkernel(void) -{ -} - -static void __init request_crashkernel(struct resource *res) -{ -} -#endif /* !defined(CONFIG_KEXEC) */ static void __init check_kernel_sections_mem(void) { From 37281a1a4916e709401e728b5e7291d56b28e1fe Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:52 +0800 Subject: [PATCH 483/707] riscv, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on risc-v with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/riscv/kernel/elf_kexec.c | 9 +++++++-- arch/riscv/mm/init.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index 5bd1ec3341fe9c..54260c16f9912a 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -117,6 +117,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, return ret; } +#ifdef CONFIG_CRASH_DUMP static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,6 +190,7 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; return cmdline_ptr; } +#endif static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, @@ -196,12 +198,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long cmdline_len) { int ret; + void *fdt; unsigned long old_kernel_pbase = ULONG_MAX; unsigned long new_kernel_pbase = 0UL; unsigned long initrd_pbase = 0UL; - unsigned long headers_sz; unsigned long kernel_start; - void *fdt, *headers; struct elfhdr ehdr; struct kexec_buf kbuf; struct kexec_elf_info elf_info; @@ -227,8 +228,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, kbuf.buf_min = new_kernel_pbase + kernel_len; kbuf.buf_max = ULONG_MAX; +#ifdef CONFIG_CRASH_DUMP /* Add elfcorehdr */ if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { pr_err("Preparing elf core header failed\n"); @@ -264,6 +268,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, } cmdline = modified_cmdline; } +#endif #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* Add purgatory to the image */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 32cad6a65ccd23..245919dda91043 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1358,7 +1358,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From 41c5e0bec007b5910043563ca49d66f7a188d464 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:53 +0800 Subject: [PATCH 484/707] arm, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm with some adjustments. Here use CONFIG_CRASH_RESERVE ifdef to replace CONFIG_KEXEC ifdef. Link: https://lkml.kernel.org/r/20240124051254.67105-14-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/arm/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index ff2299ce1ad7a3..7b33b157fca0dc 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -979,7 +979,7 @@ static int __init init_machine_late(void) } late_initcall(init_machine_late); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_CRASH_RESERVE /* * The crash region must be aligned to 128MB to avoid * zImage relocating below the reserved region. @@ -1066,7 +1066,7 @@ static void __init reserve_crashkernel(void) } #else static inline void reserve_crashkernel(void) {} -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_CRASH_RESERVE*/ void __init hyp_mode_check(void) { From 57d6736e7a6eb47939977444770f74408a4f5732 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:54 +0800 Subject: [PATCH 485/707] loongarch, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on loongarch with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-15-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Signed-off-by: Andrew Morton --- arch/loongarch/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index edf2bba8013067..57d37dd9f964d3 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -260,7 +260,7 @@ static void __init arch_reserve_crashkernel(void) char *cmdline = boot_command_line; bool high = false; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From f73c7a5997426280402956c330928eef970a6571 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:22 +0000 Subject: [PATCH 486/707] mm/zswap: make sure each swapfile always have zswap rb-tree Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2. When testing the zswap performance by using kernel build -j32 in a tmpfs directory, I found the scalability of zswap rb-tree is not good, which is protected by the only spinlock. That would cause heavy lock contention if multiple tasks zswap_store/load concurrently. So a simple solution is to split the only one zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M). This idea is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. And it's complementary with the ongoing zswap xarray conversion by Chris. Anyway, I think we can also merge this first, it's complementary IMHO. So I just refresh and resend this for further discussion. This patch (of 2): Not all zswap interfaces can handle the absence of the zswap rb-tree, actually only zswap_store() has handled it for now. To make things simple, we make sure each swapfile always have the zswap rb-tree prepared before being enabled and used. The preparation is unlikely to fail in practice, this patch just make it explicit. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 7 +++++-- mm/swapfile.c | 10 +++++++--- mm/zswap.c | 7 ++++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65fac..eca388229d9a76 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +int zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline int zswap_swapon(int type) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} diff --git a/mm/swapfile.c b/mm/swapfile.c index 556ff7347d5f04..726a2c4d185217 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2348,8 +2348,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - zswap_swapon(p->type); - spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -3167,6 +3165,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; + error = zswap_swapon(p->type); + if (error) + goto free_swap_address_space; + /* * Flush any pending IO and dirty mappings before we start using this * swap device. @@ -3175,7 +3177,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto free_swap_address_space; + goto free_swap_zswap; } mutex_lock(&swapon_mutex); @@ -3199,6 +3201,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_zswap: + zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: diff --git a/mm/zswap.c b/mm/zswap.c index e7b38aefb9afe7..3901a2445b1578 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1519,7 +1519,7 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!zswap_enabled || !tree) + if (!zswap_enabled) return false; /* @@ -1772,19 +1772,20 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; + return 0; } void zswap_swapoff(int type) From 4d6262db26280c70b122008f5da894342786ca14 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:23 +0000 Subject: [PATCH 487/707] mm/zswap: split zswap rb-tree Each swapfile has one rb-tree to search the mapping of swp_entry_t to zswap_entry, that use a spinlock to protect, which can cause heavy lock contention if multiple tasks zswap_store/load concurrently. Optimize the scalability problem by splitting the zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M), just like we did in the swap cache address_space splitting. Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 +-- mm/swapfile.c | 2 +- mm/zswap.c | 71 ++++++++++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index eca388229d9a76..91895ce1fdbc4f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -int zswap_swapon(int type); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline int zswap_swapon(int type) +static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 726a2c4d185217..b11b6057d8b5fb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3165,7 +3165,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type); + error = zswap_swapon(p->type, maxpages); if (error) goto free_swap_address_space; diff --git a/mm/zswap.c b/mm/zswap.c index 3901a2445b1578..1a864c8bc081bc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -239,6 +239,7 @@ struct zswap_tree { }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,6 +266,12 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) @@ -865,7 +872,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * until the entry is verified to still be alive in the tree. */ swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; + tree = swap_zswap_tree(entry->swpentry); list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either @@ -1494,10 +1501,9 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; @@ -1569,7 +1575,7 @@ bool zswap_store(struct folio *folio) src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1651,7 +1657,7 @@ bool zswap_store(struct folio *folio) mutex_unlock(&acomp_ctx->mutex); /* populate entry */ - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->handle = handle; entry->length = dlen; @@ -1711,10 +1717,9 @@ bool zswap_store(struct folio *folio) bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1757,7 +1762,7 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(int type, pgoff_t offset) { - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; /* find */ @@ -1772,37 +1777,53 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -int zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + } + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + for (i = 0; i < nr_zswap_trees[type]; i++) { + struct zswap_tree *tree = trees + i; + struct zswap_entry *entry, *n; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, + &tree->rbroot, + rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + } + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } From e6a1ca66392f9af26cafb428123fe066c826b06c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:11 +0000 Subject: [PATCH 488/707] mm: swap: enforce updating inuse_pages at the end of swap_range_free() Patch series "mm: zswap: simplify zswap_swapoff()", v2. These patches aim to simplify zswap_swapoff() by removing the unnecessary trees cleanup code. Patch 1 makes sure that the order of operations during swapoff is enforced correctly, making sure the simplification in patch 2 is correct in a future-proof manner. This patch (of 2): In swap_range_free(), we update inuse_pages then do some cleanups (arch invalidation, zswap invalidation, swap cache cleanups, etc). During swapoff, try_to_unuse() checks that inuse_pages is 0 to make sure all swap entries are freed. Make sure we only update inuse_pages after we are done with the cleanups in swap_range_free(), and use the proper memory barriers to enforce it. This makes sure that code following try_to_unuse() can safely assume that swap_range_free() ran for all entries in thr swapfile (e.g. swap cache cleanup, zswap_swapoff()). In practice, this currently isn't a problem because swap_range_free() is called with the swap info lock held, and the swapoff code happens to spin for that after try_to_unuse(). However, this seems fragile and unintentional, so make it more relable and future-proof. This also facilitates a following simplification of zswap_swapoff(). Link: https://lkml.kernel.org/r/20240124045113.415378-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240124045113.415378-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: "Huang, Ying" Cc: Chengming Zhou Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b11b6057d8b5fb..0580bb3e34d773 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } - atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -752,6 +750,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, offset++; } clear_shadow_from_swap_cache(si->type, begin, end); + + /* + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 + * only after the above cleanups are done. + */ + smp_wmb(); + atomic_long_add(nr_entries, &nr_swap_pages); + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) @@ -2049,7 +2055,7 @@ static int try_to_unuse(unsigned int type) unsigned int i; if (!READ_ONCE(si->inuse_pages)) - return 0; + goto success; retry: retval = shmem_unuse(type); @@ -2130,6 +2136,12 @@ static int try_to_unuse(unsigned int type) return -EINTR; } +success: + /* + * Make sure that further cleanups after try_to_unuse() returns happen + * after swap_range_free() reduces si->inuse_pages to 0. + */ + smp_mb(); return 0; } From 008a70daf58254538a14b42d6c144997c980ad90 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:12 +0000 Subject: [PATCH 489/707] mm: zswap: remove unnecessary trees cleanups in zswap_swapoff() During swapoff, try_to_unuse() makes sure that zswap_invalidate() is called for all swap entries before zswap_swapoff() is called. This means that all zswap entries should already be removed from the tree. Simplify zswap_swapoff() by removing the trees cleanup code, and leave an assertion in its place. Link: https://lkml.kernel.org/r/20240124045113.415378-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Chris Li Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1a864c8bc081bc..b5638fb39ff622 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1808,19 +1808,9 @@ void zswap_swapoff(int type) if (!trees) return; - for (i = 0; i < nr_zswap_trees[type]; i++) { - struct zswap_tree *tree = trees + i; - struct zswap_entry *entry, *n; - - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, - &tree->rbroot, - rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - } + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); kvfree(trees); nr_zswap_trees[type] = 0; From ce3883bf8e64d3bfded2c41e9bfbfba8236e9ffa Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 24 Jan 2024 11:57:19 +0800 Subject: [PATCH 490/707] mm/mmap: introduce vma_set_range() There is a lot of code needs to set the range of vma in mmap.c, introduce vma_set_range() to simplify the code. Link: https://lkml.kernel.org/r/20240124035719.3685193-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mmap.c | 29 +++++++---------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f309a010d50fb6..1e29c5821a1dde 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); +static __always_inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* diff --git a/mm/mmap.c b/mm/mmap.c index 66f534ec90a55e..476de5daf598d1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -663,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -708,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); return 0; } @@ -1015,10 +1011,7 @@ static struct vm_area_struct vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - - vma->vm_start = vma_start; - vma->vm_end = vma_end; - vma->vm_pgoff = vma_pgoff; + vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) vma_iter_store(vmi, vma); @@ -2811,11 +2804,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma_iter_config(&vmi, addr, end); - vma->vm_start = addr; - vma->vm_end = end; + vma_set_range(vma, addr, end, pgoff); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; if (file) { vma->vm_file = get_file(file); @@ -3165,9 +3156,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_set_anonymous(vma); - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = addr >> PAGE_SHIFT; + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); @@ -3404,9 +3393,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = vm_area_dup(vma); if (!new_vma) goto out; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; + vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) @@ -3574,9 +3561,7 @@ static struct vm_area_struct *__install_special_mapping( if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - vma->vm_start = addr; - vma->vm_end = addr + len; - + vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); From 88889777af485e0d0cccb6e84713d7ec997cc1eb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:14:23 +0000 Subject: [PATCH 491/707] mm: zswap: remove unused tree argument in zswap_entry_put() Commit 7310895779624 ("mm: zswap: tighten up entry invalidation") removed the usage of tree argument, delete it. Link: https://lkml.kernel.org/r/20240125081423.1200336-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b5638fb39ff622..50a4af3f185607 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -569,8 +569,7 @@ static void zswap_entry_get(struct zswap_entry *entry) /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_entry_put(struct zswap_entry *entry) { int refcount = --entry->refcount; @@ -853,7 +852,7 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); + zswap_entry_put(entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, @@ -922,7 +921,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o put_unlock: /* Drop local reference */ - zswap_entry_put(tree, entry); + zswap_entry_put(entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); @@ -1754,7 +1753,7 @@ bool zswap_load(struct folio *folio) zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); + zswap_entry_put(entry); spin_unlock(&tree->lock); return true; From edcadaf023d516cf087d065b9921a4098459b9cb Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:46 -0800 Subject: [PATCH 492/707] dax/bus.c: replace driver-core lock usage by a local rwsem Patch series "Add DAX ABI for memmap_on_memory", v7. This series adds sysfs ABI to control memmap_on_memory behavior for DAX devices. Patch 1 replaces incorrect device_lock() usage with a local rwsem - this was identified during review. Patch 2 is also a preparatory patch that replaces sprintf() for sysfs operations with sysfs_emit() Patch 3 adds the missing documentation for the sysfs ABI for DAX regions and Dax devices. Patch 4 exports mhp_supports_memmap_on_memory(). Patch 5 adds the new ABI for toggling memmap_on_memory semantics for dax devices. This patch (of 5): The dax driver incorrectly used driver-core device locks to protect internal dax region and dax device configuration structures. Replace the device lock usage with a local rwsem, one each for dax region configuration and dax device configuration. As a result of this conversion, no device_lock() usage remains in dax/bus.c. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-0-20d16cb8d23d@intel.com Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-1-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 218 +++++++++++++++++++++++++++++++++------------- 1 file changed, 156 insertions(+), 62 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 1ff1ab5fa105a6..cb148f74ceda67 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -12,6 +12,18 @@ static DEFINE_MUTEX(dax_bus_lock); +/* + * All changes to the dax region configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_region_rwsem); + +/* + * All changes to the dax device configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_dev_rwsem); + #define DAX_NAME_LEN 30 struct dax_id { struct list_head list; @@ -180,7 +192,7 @@ static u64 dev_dax_size(struct dev_dax *dev_dax) u64 size = 0; int i; - device_lock_assert(&dev_dax->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); for (i = 0; i < dev_dax->nr_range; i++) size += range_len(&dev_dax->ranges[i].range); @@ -194,8 +206,15 @@ static int dax_bus_probe(struct device *dev) struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; int rc; + u64 size; - if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + rc = down_read_interruptible(&dax_dev_rwsem); + if (rc) + return rc; + size = dev_dax_size(dev_dax); + up_read(&dax_dev_rwsem); + + if (size == 0 || dev_dax->id < 0) return -ENXIO; rc = dax_drv->probe(dev_dax); @@ -283,7 +302,7 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region) resource_size_t size = resource_size(&dax_region->res); struct resource *res; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); for_each_dax_region_resource(dax_region, res) size -= resource_size(res); @@ -295,10 +314,13 @@ static ssize_t available_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; size = dax_region_avail_size(dax_region); - device_unlock(dev); + up_read(&dax_region_rwsem); return sprintf(buf, "%llu\n", size); } @@ -314,10 +336,12 @@ static ssize_t seed_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; seed = dax_region->seed; rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } @@ -333,14 +357,18 @@ static ssize_t create_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; youngest = dax_region->youngest; rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data); + static ssize_t create_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -358,7 +386,9 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, if (val != 1) return -EINVAL; - device_lock(dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; avail = dax_region_avail_size(dax_region); if (avail == 0) rc = -ENOSPC; @@ -369,7 +399,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, .id = -1, .memmap_on_memory = false, }; - struct dev_dax *dev_dax = devm_create_dev_dax(&data); + struct dev_dax *dev_dax = __devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) rc = PTR_ERR(dev_dax); @@ -387,7 +417,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, rc = len; } } - device_unlock(dev); + up_write(&dax_region_rwsem); return rc; } @@ -417,7 +447,7 @@ static void trim_dev_dax_range(struct dev_dax *dev_dax) struct range *range = &dev_dax->ranges[i].range; struct dax_region *dax_region = dev_dax->region; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, (unsigned long long)range->start, (unsigned long long)range->end); @@ -435,7 +465,7 @@ static void free_dev_dax_ranges(struct dev_dax *dev_dax) trim_dev_dax_range(dev_dax); } -static void unregister_dev_dax(void *dev) +static void __unregister_dev_dax(void *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -447,6 +477,17 @@ static void unregister_dev_dax(void *dev) put_device(dev); } +static void unregister_dev_dax(void *dev) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dev_dax(dev); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dev_dax(dev); + up_write(&dax_region_rwsem); +} + static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -463,11 +504,10 @@ static void dax_region_put(struct dax_region *dax_region) /* a return value >= 0 indicates this invocation invalidated the id */ static int __free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; struct dax_region *dax_region; int rc = dev_dax->id; - device_lock_assert(dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); if (!dev_dax->dyn_id || dev_dax->id < 0) return -1; @@ -480,12 +520,13 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax) static int free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; rc = __free_dev_dax_id(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return rc; } @@ -519,8 +560,14 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, if (!victim) return -ENXIO; - device_lock(dev); - device_lock(victim); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } dev_dax = to_dev_dax(victim); if (victim->driver || dev_dax_size(dev_dax)) rc = -EBUSY; @@ -541,12 +588,12 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, } else rc = -EBUSY; } - device_unlock(victim); + up_write(&dax_dev_rwsem); /* won the race to invalidate the device, clean it up */ if (do_del) devm_release_action(dev, unregister_dev_dax, victim); - device_unlock(dev); + up_write(&dax_region_rwsem); put_device(victim); return rc; @@ -658,16 +705,15 @@ static void dax_mapping_release(struct device *dev) put_device(parent); } -static void unregister_dax_mapping(void *data) +static void __unregister_dax_mapping(void *data) { struct device *dev = data; struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; dev_dbg(dev, "%s\n", __func__); - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dax->ranges[mapping->range_id].mapping = NULL; mapping->range_id = -1; @@ -675,28 +721,37 @@ static void unregister_dax_mapping(void *data) device_unregister(dev); } +static void unregister_dax_mapping(void *data) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dax_mapping(data); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dax_mapping(data); + up_write(&dax_region_rwsem); +} + static struct dev_dax_range *get_dax_range(struct device *dev) { struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; + int rc; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return NULL; if (mapping->range_id < 0) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return NULL; } return &dev_dax->ranges[mapping->range_id]; } -static void put_dax_range(struct dev_dax_range *dax_range) +static void put_dax_range(void) { - struct dax_mapping *mapping = dax_range->mapping; - struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); - struct dax_region *dax_region = dev_dax->region; - - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); } static ssize_t start_show(struct device *dev, @@ -709,7 +764,7 @@ static ssize_t start_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.start); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -725,7 +780,7 @@ static ssize_t end_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.end); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -741,7 +796,7 @@ static ssize_t pgoff_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#lx\n", dax_range->pgoff); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -775,7 +830,7 @@ static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) struct device *dev; int rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, "region disabled\n")) @@ -821,7 +876,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, struct resource *alloc; int i, rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); /* handle the seed alloc special case */ if (!size) { @@ -875,13 +930,12 @@ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, r { int last_range = dev_dax->nr_range - 1; struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; - struct dax_region *dax_region = dev_dax->region; bool is_shrink = resource_size(res) > size; struct range *range = &dax_range->range; struct device *dev = &dev_dax->dev; int rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) return -EINVAL; @@ -907,10 +961,13 @@ static ssize_t size_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; size = dev_dax_size(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return sprintf(buf, "%llu\n", size); } @@ -1080,17 +1137,27 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); - return -ENXIO; + rc = -ENXIO; + goto err_region; } - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + goto err_dev; + rc = dev_dax_resize(dax_region, dev_dax, val); - device_unlock(dev); - device_unlock(dax_region->dev); - return rc == 0 ? len : rc; +err_dev: + up_write(&dax_dev_rwsem); +err_region: + up_write(&dax_region_rwsem); + + if (rc == 0) + return len; + return rc; } static DEVICE_ATTR_RW(size); @@ -1138,18 +1205,24 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, return rc; rc = -ENXIO; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); + return rc; + } + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); return rc; } - device_lock(dev); to_alloc = range_len(&r); if (alloc_is_aligned(dev_dax, to_alloc)) rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? len : rc; } @@ -1196,13 +1269,19 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (!dax_align_valid(val)) return -EINVAL; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return -ENXIO; } - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } if (dev->driver) { rc = -EBUSY; goto out_unlock; @@ -1214,8 +1293,8 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (rc) dev_dax->align = align_save; out_unlock: - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? len : rc; } static DEVICE_ATTR_RW(align); @@ -1325,7 +1404,7 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) { struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; @@ -1440,6 +1519,21 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) return ERR_PTR(rc); } + +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +{ + struct dev_dax *dev_dax; + int rc; + + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return ERR_PTR(rc); + + dev_dax = __devm_create_dev_dax(data); + up_write(&dax_region_rwsem); + + return dev_dax; +} EXPORT_SYMBOL_GPL(devm_create_dev_dax); int __dax_driver_register(struct dax_device_driver *dax_drv, From 1e0308bf31f3c788a922e49da69ec79d7684ba4d Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:47 -0800 Subject: [PATCH 493/707] dax/bus.c: replace several sprintf() with sysfs_emit() There were several places where drivers/dax/bus.c uses 'sprintf' to print sysfs data. Since a sysfs_emit() helper is available specifically for this purpose, replace all the sprintf() usage for sysfs with sysfs_emit() in this file. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-2-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index cb148f74ceda67..0fd948a4443e38 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -269,7 +269,7 @@ static ssize_t id_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", dax_region->id); + return sysfs_emit(buf, "%d\n", dax_region->id); } static DEVICE_ATTR_RO(id); @@ -278,8 +278,8 @@ static ssize_t region_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); + return sysfs_emit(buf, "%llu\n", + (unsigned long long)resource_size(&dax_region->res)); } static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); @@ -289,7 +289,7 @@ static ssize_t region_align_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%u\n", dax_region->align); + return sysfs_emit(buf, "%u\n", dax_region->align); } static struct device_attribute dev_attr_region_align = __ATTR(align, 0400, region_align_show, NULL); @@ -322,7 +322,7 @@ static ssize_t available_size_show(struct device *dev, size = dax_region_avail_size(dax_region); up_read(&dax_region_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static DEVICE_ATTR_RO(available_size); @@ -340,7 +340,7 @@ static ssize_t seed_show(struct device *dev, if (rc) return rc; seed = dax_region->seed; - rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + rc = sysfs_emit(buf, "%s\n", seed ? dev_name(seed) : ""); up_read(&dax_region_rwsem); return rc; @@ -361,7 +361,7 @@ static ssize_t create_show(struct device *dev, if (rc) return rc; youngest = dax_region->youngest; - rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + rc = sysfs_emit(buf, "%s\n", youngest ? dev_name(youngest) : ""); up_read(&dax_region_rwsem); return rc; @@ -763,7 +763,7 @@ static ssize_t start_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.start); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.start); put_dax_range(); return rc; @@ -779,7 +779,7 @@ static ssize_t end_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.end); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.end); put_dax_range(); return rc; @@ -795,7 +795,7 @@ static ssize_t pgoff_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + rc = sysfs_emit(buf, "%#lx\n", dax_range->pgoff); put_dax_range(); return rc; @@ -969,7 +969,7 @@ static ssize_t size_show(struct device *dev, size = dev_dax_size(dev_dax); up_write(&dax_dev_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) @@ -1233,7 +1233,7 @@ static ssize_t align_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax->align); + return sysfs_emit(buf, "%d\n", dev_dax->align); } static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) @@ -1311,7 +1311,7 @@ static ssize_t target_node_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); + return sysfs_emit(buf, "%d\n", dev_dax_target_node(dev_dax)); } static DEVICE_ATTR_RO(target_node); @@ -1327,7 +1327,7 @@ static ssize_t resource_show(struct device *dev, else start = dev_dax->ranges[0].range.start; - return sprintf(buf, "%#llx\n", start); + return sysfs_emit(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -1338,14 +1338,14 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, * We only ever expect to handle device-dax instances, i.e. the * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero */ - return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); + return sysfs_emit(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); } static DEVICE_ATTR_RO(modalias); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_node(dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(dev)); } static DEVICE_ATTR_RO(numa_node); From 7c04d26dcbe7be05c740796b5bc1f513937b6cd9 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:48 -0800 Subject: [PATCH 494/707] Documentatiion/ABI: add ABI documentation for sys-bus-dax Add the missing sysfs ABI documentation for the device DAX subsystem. Various ABI attributes under this have been present since v5.1, and more have been added over time. In preparation for adding a new attribute, add this file with the historical details. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-3-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 136 ++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-dax diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax new file mode 100644 index 00000000000000..6359f7bc9bf430 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -0,0 +1,136 @@ +What: /sys/bus/dax/devices/daxX.Y/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Provides a way to specify an alignment for a dax device. + Values allowed are constrained by the physical address ranges + that back the dax device, and also by arch requirements. + +What: /sys/bus/dax/devices/daxX.Y/mapping +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) Provides a way to allocate a mapping range under a dax + device. Specified in the format -. + +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) A dax device may have multiple constituent discontiguous + address ranges. These are represented by the different + 'mappingX' subdirectories. The 'start' attribute indicates the + start physical address for the given range. The 'end' attribute + indicates the end physical address for the given range. The + 'page_offset' attribute indicates the offset of the current + range in the dax device. + +What: /sys/bus/dax/devices/daxX.Y/resource +Date: June, 2019 +KernelVersion: v5.3 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The resource attribute indicates the starting physical + address of a dax device. In case of a device with multiple + constituent ranges, it indicates the starting address of the + first range. + +What: /sys/bus/dax/devices/daxX.Y/size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The size attribute indicates the total size of a dax + device. For creating subdivided dax devices, or for resizing + an existing device, the new size can be written to this as + part of the reconfiguration process. + +What: /sys/bus/dax/devices/daxX.Y/numa_node +Date: November, 2019 +KernelVersion: v5.5 +Contact: nvdimm@lists.linux.dev +Description: + (RO) If NUMA is enabled and the platform has affinitized the + backing device for this dax device, emit the CPU node + affinity for this device. + +What: /sys/bus/dax/devices/daxX.Y/target_node +Date: February, 2019 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The target-node attribute is the Linux numa-node that a + device-dax instance may create when it is online. Prior to + being online the device's 'numa_node' property reflects the + closest online cpu node which is the typical expectation of a + device 'numa_node'. Once it is online it becomes its own + distinct numa node. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The available_size attribute tracks available dax region + capacity. This only applies to volatile hmem devices, not pmem + devices, since pmem devices are defined by nvdimm namespace + boundaries. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The size attribute indicates the size of a given dax region + in bytes. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The align attribute indicates alignment of the dax region. + Changes on align may not always be valid, when say certain + mappings were created with 2M and then we switch to 1G. This + validates all ranges against the new value being attempted, post + resizing. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The seed device is a concept for dynamic dax regions to be + able to split the region amongst multiple sub-instances. The + seed device, similar to libnvdimm seed devices, is a device + that starts with zero capacity allocated and unbound to a + driver. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The create interface to the dax region provides a way to + create a new unconfigured dax device under the given region, which + can then be configured (with a size etc.) and then probed. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) The delete interface for a dax region provides for deletion + of any 0-sized and idle dax devices. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The id attribute indicates the region id of a dax region. From 2705e47b92409155c868c12f0db02e32ccbb32d3 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:49 -0800 Subject: [PATCH 495/707] mm/memory_hotplug: export mhp_supports_memmap_on_memory() In preparation for adding sysfs ABI to toggle memmap_on_memory semantics for drivers adding memory, export the mhp_supports_memmap_on_memory() helper. This allows drivers to check if memmap_on_memory support is available before trying to request it, and display an appropriate message if it isn't available. As part of this, remove the size argument to this - with recent updates to allow memmap_on_memory for larger ranges, and the internal splitting of altmaps into respective memory blocks, the size argument is meaningless. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-4-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Huang Ying Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ee00015575aab3..70aadb2009a08c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -137,6 +137,7 @@ struct mhp_params { bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); struct range mhp_get_pluggable_range(bool need_mapping); +bool mhp_supports_memmap_on_memory(void); /* * Zone resizing functions @@ -278,6 +279,11 @@ static inline bool movable_node_is_enabled(void) return false; } +static bool mhp_supports_memmap_on_memory(void) +{ + return false; +} + static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 707027f691503f..a444e2d7dd2bff 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1337,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) } #endif -static bool mhp_supports_memmap_on_memory(unsigned long size) +bool mhp_supports_memmap_on_memory(void) { unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); @@ -1346,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * - * a) We span a single memory block: memory onlining/offlinin;g happens - * in memory block granularity. We don't want the vmemmap of online - * memory blocks to reside on offline memory blocks. In the future, - * we might want to support variable-sized memory blocks to make the - * feature more versatile. - * - * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * - * c) The vmemmap pages (and thereby the pages that will be exposed to + * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. @@ -1368,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ - if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + if (!mhp_memmap_on_memory()) return false; /* @@ -1391,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) return arch_supports_memmap_on_memory(vmemmap_size); } +EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) { @@ -1526,7 +1521,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && - mhp_supports_memmap_on_memory(memory_block_size_bytes())) { + mhp_supports_memmap_on_memory()) { ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; From 7e70ae812236a784564f59320ee7e94a5e5b5caf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 27 Jan 2024 19:13:46 -0800 Subject: [PATCH 496/707] mm-memory_hotplug-export-mhp_supports_memmap_on_memory-fix fix build In file included from ./include/linux/mmzone.h:1425, from ./include/linux/gfp.h:7, from ./include/linux/slab.h:16, from ./include/linux/crypto.h:17, from arch/x86/kernel/asm-offsets.c:9: ./include/linux/memory_hotplug.h:282:13: warning: 'mhp_supports_memmap_on_memory' defined but not used [-Wunused-function] 282 | static bool mhp_supports_memmap_on_memory(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 70aadb2009a08c..7a9ff464608d70 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -279,7 +279,7 @@ static inline bool movable_node_is_enabled(void) return false; } -static bool mhp_supports_memmap_on_memory(void) +static inline bool mhp_supports_memmap_on_memory(void) { return false; } From 6e024d502dd3631dfee6997d970ca51be60769d1 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:50 -0800 Subject: [PATCH 497/707] dax: add a sysfs knob to control memmap_on_memory behavior Add a sysfs knob for dax devices to control the memmap_on_memory setting if the dax device were to be hotplugged as system memory. The default memmap_on_memory setting for dax devices originating via pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to preserve legacy behavior. For dax devices via CXL, the default is on. The sysfs control allows the administrator to override the above defaults if needed. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-5-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Tested-by: Li Zhijian Reviewed-by: Jonathan Cameron Reviewed-by: David Hildenbrand Reviewed-by: Huang, Ying Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 17 ++++++++++ drivers/dax/bus.c | 43 +++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax index 6359f7bc9bf430..b34266bfae49ae 100644 --- a/Documentation/ABI/testing/sysfs-bus-dax +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -134,3 +134,20 @@ KernelVersion: v5.1 Contact: nvdimm@lists.linux.dev Description: (RO) The id attribute indicates the region id of a dax region. + +What: /sys/bus/dax/devices/daxX.Y/memmap_on_memory +Date: January, 2024 +KernelVersion: v6.8 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Control the memmap_on_memory setting if the dax device + were to be hotplugged as system memory. This determines whether + the 'altmap' for the hotplugged memory will be placed on the + device being hotplugged (memmap_on_memory=1) or if it will be + placed on regular memory (memmap_on_memory=0). This attribute + must be set before the device is handed over to the 'kmem' + driver (i.e. hotplugged into system-ram). Additionally, this + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled + memmap_on_memory parameter for memory_hotplug. This is + typically set on the kernel command line - + memory_hotplug.memmap_on_memory set to 'true' or 'force'." diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 0fd948a4443e38..27c86d0ca7118d 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1349,6 +1349,48 @@ static ssize_t numa_node_show(struct device *dev, } static DEVICE_ATTR_RO(numa_node); +static ssize_t memmap_on_memory_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory); +} + +static ssize_t memmap_on_memory_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + bool val; + int rc; + + rc = kstrtobool(buf, &val); + if (rc) + return rc; + + if (val == true && !mhp_supports_memmap_on_memory()) { + dev_dbg(dev, "memmap_on_memory is not available\n"); + return -EOPNOTSUPP; + } + + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; + + if (dev_dax->memmap_on_memory != val && dev->driver && + to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) { + up_write(&dax_dev_rwsem); + return -EBUSY; + } + + dev_dax->memmap_on_memory = val; + up_write(&dax_dev_rwsem); + + return len; +} +static DEVICE_ATTR_RW(memmap_on_memory); + static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -1375,6 +1417,7 @@ static struct attribute *dev_dax_attributes[] = { &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, + &dev_attr_memmap_on_memory.attr, NULL, }; From 056484c1a19b104991c5276ddfb1841c44f76d04 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 18:12:15 +0000 Subject: [PATCH 498/707] highmem: add kernel-doc for memcpy_*_folio() This was inadvertently skipped when adding the new functions. Link: https://lkml.kernel.org/r/20240124181217.1761674-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e873c..00341b56d2910d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_folio - Copy a range of bytes from a folio. + * @to: The memory to copy to. + * @folio: The folio to read from. + * @offset: The first byte in the folio to read. + * @len: The number of bytes to copy. + */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { @@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, } while (len > 0); } +/** + * memcpy_to_folio - Copy a range of bytes to a folio. + * @folio: The folio to write to. + * @offset: The first byte in the folio to store to. + * @from: The memory to copy from. + * @len: The number of bytes to copy. + */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { From bd8ef4803f72d2ace7cadcb5a21474d8caf1a079 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 24 Jan 2024 18:31:34 +0100 Subject: [PATCH 499/707] mm: kmsan: remove runtime checks from kmsan_unpoison_memory() Similarly to what's been done in commit 85716a80c16d ("kmsan: allow using __msan_instrument_asm_store() inside runtime"), it should be safe to call kmsan_unpoison_memory() from within the runtime, as it does not allocate memory or take locks. Remove the redundant runtime checks. This should fix false positives seen with CONFIG_DEBUG_LIST=y when the non-instrumented lib/stackdepot.c failed to unpoison the memory chunks later checked by the instrumented lib/list_debug.c Also replace the implementation of kmsan_unpoison_entry_regs() with a call to kmsan_unpoison_memory(). Link: https://lkml.kernel.org/r/20240124173134.1165747-1-glider@google.com Fixes: f80be4571b19b9 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Tested-by: Marco Elver Cc: Dmitry Vyukov Cc: Ilya Leoshkevich Cc: Nicholas Miehlbradt Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5d6e2dee5692a3..0b09daa188ef6c 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, } /* Functions from kmsan-checks.h follow. */ + +/* + * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it + * into the stack depot. This may cause deadlocks if done from within KMSAN + * runtime, therefore we bail out if kmsan_in_runtime(). + */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) } EXPORT_SYMBOL(kmsan_poison_memory); +/* + * Unlike kmsan_poison_memory(), this function can be used from within KMSAN + * runtime, because it does not trigger allocations or call instrumented code. + */ void kmsan_unpoison_memory(const void *address, size_t size) { unsigned long ua_flags; - if (!kmsan_enabled || kmsan_in_runtime()) + if (!kmsan_enabled) return; ua_flags = user_access_save(); - kmsan_enter_runtime(); /* The users may want to poison/unpoison random memory. */ kmsan_internal_unpoison_memory((void *)address, size, KMSAN_POISON_NOCHECK); - kmsan_leave_runtime(); user_access_restore(ua_flags); } EXPORT_SYMBOL(kmsan_unpoison_memory); /* - * Version of kmsan_unpoison_memory() that can be called from within the KMSAN - * runtime. - * - * Non-instrumented IRQ entry functions receive struct pt_regs from assembly - * code. Those regs need to be unpoisoned, otherwise using them will result in - * false positives. - * Using kmsan_unpoison_memory() is not an option in entry code, because the - * return value of in_task() is inconsistent - as a result, certain calls to - * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that - * the registers are unpoisoned even if kmsan_in_runtime() is true in the early - * entry code. + * Version of kmsan_unpoison_memory() called from IRQ entry functions. */ void kmsan_unpoison_entry_regs(const struct pt_regs *regs) { - unsigned long ua_flags; - - if (!kmsan_enabled) - return; - - ua_flags = user_access_save(); - kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), - KMSAN_POISON_NOCHECK); - user_access_restore(ua_flags); + kmsan_unpoison_memory((void *)regs, sizeof(*regs)); } void kmsan_check_memory(const void *addr, size_t size) From 02ebb4652e3749bb4b139aaaebf593ca836d5e71 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 22 Jan 2024 22:46:33 -0500 Subject: [PATCH 500/707] mm/compaction: enable compacting >0 order folios. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Enable >0 order folio memory compaction", v2. This patchset enables >0 order folio memory compaction, which is one of the prerequisitions for large folio support[1]. I am aware that split free pages is necessary for folio migration in compaction, since if >0 order free pages are never split and no order-0 free page is scanned, compaction will end prematurely due to migration returns -ENOMEM. Free page split becomes a must instead of an optimization. Some applications from vm-scalability show different performance trends on default LRU and CONFIG_LRU_GEN from patch 1 (split folio during compaction), to patch 2 (folio migration during compaction), to patch 3 (folio migration during compaction with free page split). I am looking into it. lkp ncompare results (with >5% delta) for default LRU and CONFIG_LRU_GEN are shown at the bottom (on a 8-CPU (Intel Xeon E5-2650 v4 @ 2.20GHz) 16G VM). Overview ======== To support >0 order folio compaction, the patchset changes how free pages used for migration are kept during compaction. Free pages used to be split into order-0 pages that are post allocation processed (i.e., PageBuddy flag cleared, page order stored in page->private is zeroed, and page reference is set to 1). Now all free pages are kept in a MAX_ORDER+1 array of page lists based on their order without post allocation process. When migrate_pages() asks for a new page, one of the free pages, based on the requested page order, is then processed and given out. [1] https://lore.kernel.org/linux-mm/20230912162815.440749-1-zi.yan@sent.com/ [2] https://lore.kernel.org/linux-mm/20231113170157.280181-1-zi.yan@sent.com/ vm-scalability results on CONFIG_LRU_GEN === ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/small-allocs/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 2024326 +35.5% 2743772 ± 41% +364.0% 9392198 ± 35% +31.0% 2651634 vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/small-allocs-mt/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 1450189 +0.9% 1463418 +30.4% 1891610 ± 22% +0.3% 1454100 vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/mmap-xread-seq-mt/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 14428848 ± 27% -51.7% 6963308 ± 73% +13.5% 16372621 +11.2% 16046511 vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/mmap-pread-seq/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 13569502 ± 24% -45.9% 7340064 ± 59% +12.3% 15240531 +10.4% 14983705 vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/mmap-pread-seq-mt/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 13305823 ± 24% -45.1% 7299664 ± 56% +12.5% 14974725 +10.4% 14695963 vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/lru-file-readtwice/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 13244376 ± 28% +54.2% 20425838 ± 23% -4.4% 12660113 ± 3% -9.0% 12045809 ± 3% vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/lru-file-mmap-read/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 7021425 ± 11% -20.9% 5556751 ± 19% +14.8% 8057811 ± 3% +9.4% 7678613 ± 4% vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/size/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/256G/qemu-vm/msync/vm-scalability commit: 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-compaction+ 6.7.0-rc4-folio-migration-in-compaction+ 6.7.0-rc4-folio-migration-free-page-split+ 6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 1208994 ±137% +263.5% 4394683 ± 49% -49.4% 611204 ± 6% -48.1% 627937 ± 13% vm-scalability.throughput vm-scalability results on default LRU (with -no-mglru suffix) === ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/lru-file-readtwice/vm-scalability commit: 6.7.0-rc4-no-mglru+ 6.7.0-rc4-split-folio-in-compaction-no-mglru+ 6.7.0-rc4-folio-migration-in-compaction-no-mglru+ 6.7.0-rc4-folio-migration-free-page-split-no-mglru+ 6.7.0-rc4-no-mgl 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 8412072 ± 3% +32.1% 11114537 ± 41% +3.5% 8703491 ± 3% +1.5% 8536343 ± 3% vm-scalability.throughput ========================================================================================= compiler/kconfig/rootfs/runtime/tbox_group/test/testcase: gcc-13/defconfig/debian/300s/qemu-vm/lru-file-mmap-read/vm-scalability commit: 6.7.0-rc4-no-mglru+ 6.7.0-rc4-split-folio-in-compaction-no-mglru+ 6.7.0-rc4-folio-migration-in-compaction-no-mglru+ 6.7.0-rc4-folio-migration-free-page-split-no-mglru+ 6.7.0-rc4-no-mgl 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 7095358 +10.8% 7863635 ± 16% +5.5% 7484110 +1.5% 7200666 ± 4% vm-scalability.throughput This patch (of 3): migrate_pages() supports >0 order folio migration and during compaction, even if compaction_alloc() cannot provide >0 order free pages, migrate_pages() can split the source page and try to migrate the base pages from the split. It can be a baseline and start point for adding support for compacting >0 order folios. Link: https://lkml.kernel.org/r/20240123034636.1095672-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240123034636.1095672-2-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Huang Ying Reviewed-by: Baolin Wang Cc: Adam Manzanares Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/compaction.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d99..e43e898d2c77f9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,21 @@ static bool too_many_isolated(struct compact_control *cc) return too_many; } +/* + * 1. if the page order is larger than or equal to target_order (i.e., + * cc->order and when it is not -1 for global compaction), skip it since + * target_order already indicates no free page with larger than target_order + * exists and later migrating it will most likely fail; + * + * 2. compacting > pageblock_order pages does not improve memory fragmentation, + * skip them; + */ +static bool skip_isolation_on_order(int order, int target_order) +{ + return (target_order != -1 && order >= target_order) || + order >= pageblock_order; +} + /** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock @@ -1010,7 +1025,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Regardless of being on LRU, compound pages such as THP and * hugetlbfs are not to be compacted unless we are attempting - * an allocation much larger than the huge page size (eg CMA). + * an allocation larger than the compound page size. * We can potentially save a lot of iterations if we skip them * at once. The check is racy, but we can consider only valid * values and the only danger is skipping too much. @@ -1018,11 +1033,18 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_PAGE_ORDER)) { - low_pfn += (1UL << order) - 1; - nr_scanned += (1UL << order) - 1; + /* + * Skip based on page order and compaction target order + * and skip hugetlbfs pages. + */ + if (skip_isolation_on_order(order, cc->order) || + PageHuge(page)) { + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; } - goto isolate_fail; } /* @@ -1165,10 +1187,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * folio become large since the non-locked check, - * and it's on LRU. + * Check LRU folio order under the lock */ - if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + if (unlikely(skip_isolation_on_order(folio_order(folio), + cc->order) && + !cc->alloc_contig)) { low_pfn += folio_nr_pages(folio) - 1; nr_scanned += folio_nr_pages(folio) - 1; folio_set_lru(folio); @@ -1786,6 +1809,10 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + /* this makes migrate_pages() split the source page and retry */ + if (folio_test_large(src) > 0) + return NULL; + if (list_empty(&cc->freepages)) { isolate_freepages(cc); From b6feb42e379bdb8da2567fdf3bb9fcb3cf84fe10 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 22 Jan 2024 22:46:34 -0500 Subject: [PATCH 501/707] mm/compaction: add support for >0 order folio memory compaction. Before last commit, memory compaction only migrates order-0 folios and skips >0 order folios. Last commit splits all >0 order folios during compaction. This commit migrates >0 order folios during compaction by keeping isolated free pages at their original size without splitting them into order-0 pages and using them directly during migration process. What is different from the prior implementation: 1. All isolated free pages are kept in a NR_PAGE_ORDERS array of page lists, where each page list stores free pages in the same order. 2. All free pages are not post_alloc_hook() processed nor buddy pages, although their orders are stored in first page's private like buddy pages. 3. During migration, in new page allocation time (i.e., in compaction_alloc()), free pages are then processed by post_alloc_hook(). When migration fails and a new page is returned (i.e., in compaction_free()), free pages are restored by reversing the post_alloc_hook() operations using newly added free_pages_prepare_fpi_none(). Step 3 is done for a latter optimization that splitting and/or merging free pages during compaction becomes easier. Note: without splitting free pages, compaction can end prematurely due to migration will return -ENOMEM even if there is free pages. This happens when no order-0 free page exist and compaction_alloc() return NULL. Link: https://lkml.kernel.org/r/20240123034636.1095672-3-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Adam Manzanares Cc: Baolin Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/compaction.c | 148 +++++++++++++++++++++++++++++------------------- mm/internal.h | 9 ++- mm/page_alloc.c | 6 ++ 3 files changed, 103 insertions(+), 60 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e43e898d2c77f9..7465a24288c165 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -66,45 +66,67 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif -static unsigned long release_freepages(struct list_head *freelist) +static void init_page_list(struct page_list *p) { - struct page *page, *next; - unsigned long high_pfn = 0; - - list_for_each_entry_safe(page, next, freelist, lru) { - unsigned long pfn = page_to_pfn(page); - list_del(&page->lru); - __free_page(page); - if (pfn > high_pfn) - high_pfn = pfn; - } - - return high_pfn; + INIT_LIST_HEAD(&p->pages); + p->nr_pages = 0; } -static void split_map_pages(struct list_head *list) +static void split_map_pages(struct page_list *freepages) { - unsigned int i, order, nr_pages; + unsigned int i, order, total_nr_pages; struct page *page, *next; LIST_HEAD(tmp_list); - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + total_nr_pages = freepages[order].nr_pages * (1 << order); + freepages[order].nr_pages = 0; + + list_for_each_entry_safe(page, next, &freepages[order].pages, lru) { + unsigned int nr_pages; + + list_del(&page->lru); - order = page_private(page); - nr_pages = 1 << order; + nr_pages = 1 << order; - post_alloc_hook(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } } + freepages[0].nr_pages += total_nr_pages; + list_splice_init(&tmp_list, &freepages[0].pages); } +} - list_splice(&tmp_list, list); +static unsigned long release_free_list(struct page_list *freepages) +{ + int order; + unsigned long high_pfn = 0; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + + list_for_each_entry_safe(page, next, &freepages[order].pages, lru) { + unsigned long pfn = page_to_pfn(page); + + list_del(&page->lru); + /* + * Convert free pages into post allocation pages, so + * that we can free them via __free_page. + */ + post_alloc_hook(page, order, __GFP_MOVABLE); + __free_pages(page, order); + if (pfn > high_pfn) + high_pfn = pfn; + } + freepages[order].nr_pages = 0; + } + return high_pfn; } #ifdef CONFIG_COMPACTION @@ -583,7 +605,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long *start_pfn, unsigned long end_pfn, - struct list_head *freelist, + struct page_list *freelist, unsigned int stride, bool strict) { @@ -657,7 +679,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; - list_add_tail(&page->lru, freelist); + list_add_tail(&page->lru, &freelist[order].pages); + freelist[order].nr_pages++; if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; @@ -722,7 +745,11 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; - LIST_HEAD(freelist); + int order; + struct page_list tmp_freepages[NR_PAGE_ORDERS]; + + for (order = 0; order < NR_PAGE_ORDERS; order++) + init_page_list(&tmp_freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -753,7 +780,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, 0, true); + block_end_pfn, tmp_freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -770,15 +797,15 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* __isolate_free_page() does not map the pages */ - split_map_pages(&freelist); - if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_freepages(&freelist); + release_free_list(tmp_freepages); return 0; } + /* __isolate_free_page() does not map the pages */ + split_map_pages(tmp_freepages); + /* We don't use freelists for anything. */ return pfn; } @@ -1481,7 +1508,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) if (!page) return; - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn && !cc->no_set_skip_hint) @@ -1610,7 +1637,7 @@ static void fast_isolate_freepages(struct compact_control *cc) nr_scanned += nr_isolated - 1; total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; - list_add_tail(&page->lru, &cc->freepages); + list_add_tail(&page->lru, &cc->freepages[order].pages); count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ @@ -1687,13 +1714,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - struct list_head *freelist = &cc->freepages; unsigned int stride; /* Try a small search of the free lists for a candidate */ fast_isolate_freepages(cc); if (cc->nr_freepages) - goto splitmap; + return; /* * Initialise the free scanner. The starting point is where we last @@ -1753,7 +1779,7 @@ static void isolate_freepages(struct compact_control *cc) /* Found a block suitable for isolating free pages from. */ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, stride, false); + block_end_pfn, cc->freepages, stride, false); /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) @@ -1794,10 +1820,6 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; - -splitmap: - /* __isolate_free_page() does not map the pages */ - split_map_pages(freelist); } /* @@ -1808,23 +1830,22 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + int order = folio_order(src); - /* this makes migrate_pages() split the source page and retry */ - if (folio_test_large(src) > 0) - return NULL; - - if (list_empty(&cc->freepages)) { + if (!cc->freepages[order].nr_pages) { isolate_freepages(cc); - - if (list_empty(&cc->freepages)) + if (!cc->freepages[order].nr_pages) return NULL; } - dst = list_entry(cc->freepages.next, struct folio, lru); + dst = list_first_entry(&cc->freepages[order].pages, struct folio, lru); + cc->freepages[order].nr_pages--; list_del(&dst->lru); - cc->nr_freepages--; - - return dst; + post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + if (order) + prep_compound_page(&dst->page, order); + cc->nr_freepages -= 1 << order; + return page_rmappable_folio(&dst->page); } /* @@ -1835,9 +1856,17 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; + int order = folio_order(dst); + struct page *page = &dst->page; + + folio_set_count(dst, 0); + free_pages_prepare_fpi_none(page, order); - list_add(&dst->lru, &cc->freepages); - cc->nr_freepages++; + INIT_LIST_HEAD(&dst->lru); + + list_add(&dst->lru, &cc->freepages[order].pages); + cc->freepages[order].nr_pages++; + cc->nr_freepages += 1 << order; } /* possible outcome of isolate_migratepages */ @@ -2461,6 +2490,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; unsigned int nr_succeeded = 0; + int order; /* * These counters track activities during zone compaction. Initialize @@ -2470,7 +2500,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->total_free_scanned = 0; cc->nr_migratepages = 0; cc->nr_freepages = 0; - INIT_LIST_HEAD(&cc->freepages); + for (order = 0; order < NR_PAGE_ORDERS; order++) + init_page_list(&cc->freepages[order]); INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); @@ -2656,7 +2687,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) * so we don't leave any returned pages behind in the next attempt. */ if (cc->nr_freepages > 0) { - unsigned long free_pfn = release_freepages(&cc->freepages); + unsigned long free_pfn = release_free_list(cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); @@ -2675,7 +2706,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); return ret; diff --git a/mm/internal.h b/mm/internal.h index 1e29c5821a1dde..c6ea449c5353ce 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -447,6 +447,8 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern bool free_pages_prepare_fpi_none(struct page *page, unsigned int order); + extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); @@ -473,6 +475,11 @@ int split_free_page(struct page *free_page, /* * in mm/compaction.c */ + +struct page_list { + struct list_head pages; + unsigned long nr_pages; +}; /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts @@ -481,7 +488,7 @@ int split_free_page(struct page *free_page, * completes when free_pfn <= migrate_pfn */ struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ + struct page_list freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b01048..aeda1153dad92b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1179,6 +1179,12 @@ static __always_inline bool free_pages_prepare(struct page *page, return true; } +__always_inline bool free_pages_prepare_fpi_none(struct page *page, + unsigned int order) +{ + return free_pages_prepare(page, order, FPI_NONE); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. From f2ab40c4bc9c63f4feb9af6ec5ce534cfb6055d3 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 22 Jan 2024 22:46:35 -0500 Subject: [PATCH 502/707] mm/compaction: optimize >0 order folio compaction with free page split. During migration in a memory compaction, free pages are placed in an array of page lists based on their order. But the desired free page order (i.e., the order of a source page) might not be always present, thus leading to migration failures and premature compaction termination. Split a high order free pages when source migration page has a lower order to increase migration successful rate. Note: merging free pages when a migration fails and a lower order free page is returned via compaction_free() is possible, but there is too much work. Since the free pages are not buddy pages, it is hard to identify these free pages using existing PFN-based page merging algorithm. Link: https://lkml.kernel.org/r/20240123034636.1095672-4-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Adam Manzanares Cc: Baolin Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/compaction.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 7465a24288c165..335a6f6787e4e3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1831,9 +1831,43 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; int order = folio_order(src); + bool has_isolated_pages = false; +again: if (!cc->freepages[order].nr_pages) { - isolate_freepages(cc); + int i; + + for (i = order + 1; i < NR_PAGE_ORDERS; i++) { + if (cc->freepages[i].nr_pages) { + struct page *freepage = + list_first_entry(&cc->freepages[i].pages, + struct page, lru); + + int start_order = i; + unsigned long size = 1 << start_order; + + list_del(&freepage->lru); + cc->freepages[i].nr_pages--; + + while (start_order > order) { + start_order--; + size >>= 1; + + list_add(&freepage[size].lru, + &cc->freepages[start_order].pages); + cc->freepages[start_order].nr_pages++; + set_page_private(&freepage[size], start_order); + } + dst = (struct folio *)freepage; + goto done; + } + } + if (!has_isolated_pages) { + isolate_freepages(cc); + has_isolated_pages = true; + goto again; + } + if (!cc->freepages[order].nr_pages) return NULL; } @@ -1841,6 +1875,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) dst = list_first_entry(&cc->freepages[order].pages, struct folio, lru); cc->freepages[order].nr_pages--; list_del(&dst->lru); +done: post_alloc_hook(&dst->page, order, __GFP_MOVABLE); if (order) prep_compound_page(&dst->page, order); From 0e20ae3975d9489c5e4a156ef19a0faee288c325 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 26 Jan 2024 21:19:25 +0000 Subject: [PATCH 503/707] mm: memcg: don't periodically flush stats when memcg is disabled The root memcg is onlined even when memcg is disabled. When it's onlined a 2 second periodic stat flush is started, but no stat flushing is required when memcg is disabled because there can be no child memcgs. Most calls to flush memcg stats are avoided when memcg is disabled as a result of the mem_cgroup_disabled check added in 7d7ef0a4686a ("mm: memcg: restore subtree stats flushing"), but the periodic flushing started in mem_cgroup_css_online is not. Skip it. Link: https://lkml.kernel.org/r/20240126211927.1171338-1-tjmercier@google.com Fixes: aa48e47e3906 ("memcg: infrastructure to flush memcg stats") Signed-off-by: T.J. Mercier Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Li Reported-by: Minchan Kim Reviewed-by: Yosry Ahmed Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 391ecdc5af68a0..eb8684269906e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5622,7 +5622,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; - if (unlikely(mem_cgroup_is_root(memcg))) + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); From 8f12806c067b2d57bb3ed6ce0037dba110118ce0 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 26 Jan 2024 15:25:54 +0000 Subject: [PATCH 504/707] kswapd: replace try_to_freeze() with kthread_freezable_should_stop() Instead of using try_to_freeze, use kthread_freezable_should_stop in kswapd. By this, we can avoid unnecessary freezing when kswapd should stop. Link: https://lkml.kernel.org/r/20240126152556.58791-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc66..1f139830b26f6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6796,6 +6796,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) bool raise_priority = true; bool balanced; bool ret; + bool was_frozen; sc.reclaim_idx = highest_zoneidx; @@ -6894,9 +6895,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); - ret = try_to_freeze(); + ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (was_frozen || ret) break; /* @@ -7102,7 +7103,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, @@ -7119,15 +7120,14 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); - ret = try_to_freeze(); - if (kthread_should_stop()) + if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (ret) + if (was_frozen) continue; /* From f7279801ff46b58e6bd25aecfbea28e88f42a090 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:05 +0800 Subject: [PATCH 505/707] hugetlb: code clean for hugetlb_hstate_alloc_pages Patch series "hugetlb: parallelize hugetlb page init on boot", v5. # Introduction Hugetlb initialization during boot takes up a considerable amount of time. For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2 seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel host takes more than 1 minute[1]. This is a noteworthy figure. Inspired by [2] and [3], hugetlb initialization can also be accelerated through parallelization. Kernel already has infrastructure like padata_do_multithreaded, this patch uses it to achieve effective results by minimal modifications. [1] https://lore.kernel.org/all/783f8bac-55b8-5b95-eb6a-11a583675000@google.com/ [2] https://lore.kernel.org/all/20200527173608.2885243-1-daniel.m.jordan@oracle.com/ [3] https://lore.kernel.org/all/20230906112605.2286994-1-usama.arif@bytedance.com/ [4] https://lore.kernel.org/all/76becfc1-e609-e3e8-2966-4053143170b6@google.com/ # max_threads This patch use `padata_do_multithreaded` like this: ``` job.max_threads = num_node_state(N_MEMORY) * multiplier; padata_do_multithreaded(&job); ``` To fully utilize the CPU, the number of parallel threads needs to be carefully considered. `max_threads = num_node_state(N_MEMORY)` does not fully utilize the CPU, so we need to multiply it by a multiplier. Tests below indicate that a multiplier of 2 significantly improves performance, and although larger values also provide improvements, the gains are marginal. multiplier 1 2 3 4 5 ------------ ------- ------- ------- ------- ------- 256G 2node 358ms 215ms 157ms 134ms 126ms 2T 4node 979ms 679ms 543ms 489ms 481ms 50G 2node 71ms 44ms 37ms 30ms 31ms Therefore, choosing 2 as the multiplier strikes a good balance between enhancing parallel processing capabilities and maintaining efficient resource management. # Test result test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 1G 4745 2024 57.34% 128c1T(2 node) 1G 3358 1712 49.02% 12T 1G 77000 18300 76.23% 256c2T(4 node) 2M 3336 1051 68.52% 128c1T(2 node) 2M 1943 716 63.15% This patch (of 7): The readability of `hugetlb_hstate_alloc_pages` is poor. By cleaning the code, its readability can be improved, facilitating future modifications. This patch extracts two functions to reduce the complexity of `hugetlb_hstate_alloc_pages` and has no functional changes. - hugetlb_hstate_alloc_pages_node_specific() to handle iterates through each online node and performs allocation if necessary. - hugetlb_hstate_alloc_pages_report() report error during allocation. And the value of h->max_huge_pages is updated accordingly. Link: https://lkml.kernel.org/r/20240126152411.1238072-1-gang.li@linux.dev Link: https://lkml.kernel.org/r/20240126152411.1238072-2-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1581b670d42e..b8e4a6adefd67c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3482,6 +3482,33 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) +{ + int i; + bool node_specific_alloc = false; + + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + return node_specific_alloc; +} + +static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) +{ + if (allocated < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, allocated); + h->max_huge_pages = allocated; + } +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3499,7 +3526,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) struct folio *folio; LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; - bool node_specific_alloc = false; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3508,14 +3534,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } /* do node specific alloc */ - for_each_online_node(i) { - if (h->max_huge_pages_node[i] > 0) { - hugetlb_hstate_alloc_pages_onenode(h, i); - node_specific_alloc = true; - } - } - - if (node_specific_alloc) + if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ @@ -3558,14 +3577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) /* list will be empty if hstate_is_gigantic */ prep_and_add_allocated_folios(h, &folio_list); - if (i < h->max_huge_pages) { - char buf[32]; - - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; - } + hugetlb_hstate_alloc_pages_errcheck(i, h); kfree(node_alloc_noretry); } From cfdf0faaf3f0960554a6c8dda2d68c8fab8aad32 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:06 +0800 Subject: [PATCH 506/707] hugetlb: split hugetlb_hstate_alloc_pages 1G and 2M huge pages have different allocation and initialization logic, which leads to subtle differences in parallelization. Therefore, it is appropriate to split hugetlb_hstate_alloc_pages into gigantic and non-gigantic. This patch has no functional changes. Link: https://lkml.kernel.org/r/20240126152411.1238072-3-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 87 ++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b8e4a6adefd67c..98ae108e1fac56 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3509,6 +3509,43 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + cond_resched(); + } + + return i; +} + +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); + nodemask_t node_alloc_noretry; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < h->max_huge_pages; ++i) { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); + + return i; +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3522,10 +3559,7 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t *node_alloc_noretry; + unsigned long allocated; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3538,47 +3572,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; /* below will do all node balanced alloc */ - if (!hstate_is_gigantic(h)) { - /* - * Bit mask controlling how hard we retry per-node allocations. - * Ignore errors as lower level routines can deal with - * node_alloc_noretry == NULL. If this kmalloc fails at boot - * time, we are likely in bigger trouble. - */ - node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), - GFP_KERNEL); - } else { - /* allocations done at boot time */ - node_alloc_noretry = NULL; - } - - /* bit mask controlling how hard we retry per-node allocations */ - if (node_alloc_noretry) - nodes_clear(*node_alloc_noretry); - - for (i = 0; i < h->max_huge_pages; ++i) { - if (hstate_is_gigantic(h)) { - /* - * gigantic pages not added to list as they are not - * added to pools now. - */ - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) - break; - } else { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - } - cond_resched(); - } - - /* list will be empty if hstate_is_gigantic */ - prep_and_add_allocated_folios(h, &folio_list); + if (hstate_is_gigantic(h)) + allocated = hugetlb_gigantic_pages_alloc_boot(h); + else + allocated = hugetlb_pages_alloc_boot(h); - hugetlb_hstate_alloc_pages_errcheck(i, h); - kfree(node_alloc_noretry); + hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) From 507b57b2daef420d2fe13c62f507b14387ef7445 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:07 +0800 Subject: [PATCH 507/707] padata: dispatch works on different nodes When a group of tasks that access different nodes are scheduled on the same node, they may encounter bandwidth bottlenecks and access latency. Thus, numa_aware flag is introduced here, allowing tasks to be distributed across different nodes to fully utilize the advantage of multi-node systems. Link: https://lkml.kernel.org/r/20240126152411.1238072-4-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/padata.h | 2 ++ kernel/padata.c | 14 ++++++++++++-- mm/mm_init.c | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 495b16b6b4d729..8f418711351bcc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -137,6 +137,7 @@ struct padata_shell { * appropriate for one worker thread to do at once. * @max_threads: Max threads to use for the job, actual number may be less * depending on task size and minimum chunk size. + * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion. */ struct padata_mt_job { void (*thread_fn)(unsigned long start, unsigned long end, void *arg); @@ -146,6 +147,7 @@ struct padata_mt_job { unsigned long align; unsigned long min_chunk; int max_threads; + bool numa_aware; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 179fb1518070c2..e3f639ff16707a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) struct padata_work my_work, *pw; struct padata_mt_job_state ps; LIST_HEAD(works); - int nworks; + int nworks, nid; + static atomic_t last_used_nid __initdata; if (job->size == 0) return; @@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = roundup(ps.chunk_size, job->align); list_for_each_entry(pw, &works, pw_list) - queue_work(system_unbound_wq, &pw->pw_work); + if (job->numa_aware) { + int old_node = atomic_read(&last_used_nid); + + do { + nid = next_node_in(old_node, node_states[N_CPU]); + } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); + queue_work_node(nid, system_unbound_wq, &pw->pw_work); + } else { + queue_work(system_unbound_wq, &pw->pw_work); + } /* Use the current thread, which saves starting a workqueue worker. */ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36c4..549e76af8f82a8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data) .align = PAGES_PER_SECTION, .min_chunk = PAGES_PER_SECTION, .max_threads = max_threads, + .numa_aware = false, }; padata_do_multithreaded(&job); From 5018d25e9febad6d03f532bf0167b08296c87545 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:08 +0800 Subject: [PATCH 508/707] hugetlb: pass *next_nid_to_alloc directly to for_each_node_mask_to_alloc With parallelization of hugetlb allocation across different threads, each thread works on a differnet node to allocate pages from, instead of all allocating from a common node h->next_nid_to_alloc. To address this, it's necessary to assign a separate next_nid_to_alloc for each thread. Consequently, the hstate_next_node_to_alloc and for_each_node_mask_to_alloc have been modified to directly accept a *next_nid_to_alloc parameter, ensuring thread-specific allocation and avoiding concurrent access issues. Link: https://lkml.kernel.org/r/20240126152411.1238072-5-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98ae108e1fac56..effe5539e545c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1464,15 +1464,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) * next node from which to allocate, handling wrap at end of node * mask. */ -static int hstate_next_node_to_alloc(struct hstate *h, +static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); return nid; } @@ -1495,10 +1495,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ @@ -2350,12 +2350,13 @@ static void prep_and_add_allocated_folios(struct hstate *h, */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) + nodemask_t *node_alloc_noretry, + int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, @@ -3310,7 +3311,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) goto found; } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); @@ -3679,7 +3680,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } @@ -3794,7 +3795,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, - node_alloc_noretry); + node_alloc_noretry, + &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); From 4391e4679bd7a229f4bd97a2f27e57de7d05da15 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:09 +0800 Subject: [PATCH 509/707] hugetlb: have CONFIG_HUGETLBFS select CONFIG_PADATA Allow hugetlb use padata_do_multithreaded for parallel initialization. Select CONFIG_PADATA in this case. Link: https://lkml.kernel.org/r/20240126152411.1238072-6-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075f8..a57d6e6c41e6f1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -262,6 +262,7 @@ menuconfig HUGETLBFS depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) select MEMFD_CREATE + select PADATA help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read From b1a6f2b5f1827c8df2beb9da7505b46fa48a636f Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:10 +0800 Subject: [PATCH 510/707] hugetlb: parallelize 2M hugetlb allocation and initialization By distributing both the allocation and the initialization tasks across multiple threads, the initialization of 2M hugetlb will be faster, thereby improving the boot speed. Here are some test results: test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 2M 3336 1051 68.52% 128c1T(2 node) 2M 1943 716 63.15% Link: https://lkml.kernel.org/r/20240126152411.1238072-7-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 73 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index effe5539e545c7..19d4dce2642bb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -3510,6 +3511,30 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) +{ + struct hstate *h = (struct hstate *)arg; + int i, num = end - start; + nodemask_t node_alloc_noretry; + LIST_HEAD(folio_list); + int next_node = first_online_node; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < num; ++i) { + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry, &next_node); + if (!folio) + break; + + list_move(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); +} + static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; @@ -3525,26 +3550,40 @@ static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t node_alloc_noretry; - - /* Bit mask controlling how hard we retry per-node allocations.*/ - nodes_clear(node_alloc_noretry); + struct padata_mt_job job = { + .fn_arg = h, + .align = 1, + .numa_aware = true + }; - for (i = 0; i < h->max_huge_pages; ++i) { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - &node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - cond_resched(); - } + job.thread_fn = hugetlb_pages_alloc_boot_node; + job.start = 0; + job.size = h->max_huge_pages; - prep_and_add_allocated_folios(h, &folio_list); + /* + * job.max_threads is twice the num_node_state(N_MEMORY), + * + * Tests below indicate that a multiplier of 2 significantly improves + * performance, and although larger values also provide improvements, + * the gains are marginal. + * + * Therefore, choosing 2 as the multiplier strikes a good balance between + * enhancing parallel processing capabilities and maintaining efficient + * resource management. + * + * +------------+-------+-------+-------+-------+-------+ + * | multiplier | 1 | 2 | 3 | 4 | 5 | + * +------------+-------+-------+-------+-------+-------+ + * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | + * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | + * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | + * +------------+-------+-------+-------+-------+-------+ + */ + job.max_threads = num_node_state(N_MEMORY) * 2; + job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + padata_do_multithreaded(&job); - return i; + return h->nr_huge_pages; } /* From 8b9c1350115da318a97836423f73f484a200b077 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 26 Jan 2024 23:24:11 +0800 Subject: [PATCH 511/707] hugetlb: parallelize 1G hugetlb initialization Optimize the initialization speed of 1G huge pages through parallelization. 1G hugetlbs are allocated from bootmem, a process that is already very fast and does not currently require optimization. Therefore, we focus on parallelizing only the initialization phase in `gather_bootmem_prealloc`. Here are some test results: test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 1G 4745 2024 57.34% 128c1T(2 node) 1G 3358 1712 49.02% 12T 1G 77000 18300 76.23% Link: https://lkml.kernel.org/r/20240126152411.1238072-8-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Cc: Tim Chen Signed-off-by: Andrew Morton --- arch/powerpc/mm/hugetlbpage.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 44 ++++++++++++++++++++++++++++------- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0a540b37aab62c..a1651d54718626 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -226,7 +226,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; return 1; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b11d..77b30a8c6076b6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; -extern struct list_head huge_boot_pages; +extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d4dce2642bb1..9d996fe4ecd9cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -__initdata LIST_HEAD(huge_boot_pages); +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -3301,7 +3301,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid) int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ - int nr_nodes, node; + int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { @@ -3339,7 +3339,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[node]); m->hstate = h; return 1; } @@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); - /* Add all new pool pages to free lists in one lock cycle */ - spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* @@ -3404,23 +3402,27 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + /* Subdivide locks to achieve better parallel performance */ + spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ -static void __init gather_bootmem_prealloc(void) +static void __init gather_bootmem_prealloc_node(unsigned long start, unsigned long end, void *arg) + { + int nid = start; LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages, list) { + list_for_each_entry(m, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; @@ -3453,6 +3455,22 @@ static void __init gather_bootmem_prealloc(void) prep_and_add_bootmem_folios(h, &folio_list); } +static void __init gather_bootmem_prealloc(void) +{ + struct padata_mt_job job = { + .thread_fn = gather_bootmem_prealloc_node, + .fn_arg = NULL, + .start = 0, + .size = num_node_state(N_MEMORY), + .align = 1, + .min_chunk = 1, + .max_threads = num_node_state(N_MEMORY), + .numa_aware = true, + }; + + padata_do_multithreaded(&job); +} + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; @@ -3600,6 +3618,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; + static bool initialied __initdata; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3607,6 +3626,15 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } + /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ + if (!initialied) { + int i = 0; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + initialied = true; + } + /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; From 59711c69e692f6408ffe3e6f826609837709c406 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Jan 2024 16:19:44 +0800 Subject: [PATCH 512/707] mm and cache_info: remove unnecessary CPU cache info update For each CPU hotplug event, we will update per-CPU data slice size and corresponding PCP configuration for every online CPU to make the implementation simple. But, Kyle reported that this takes tens seconds during boot on a machine with 34 zones and 3840 CPUs. So, in this patch, for each CPU hotplug event, we only update per-CPU data slice size and corresponding PCP configuration for the CPUs that share caches with the hotplugged CPU. With the patch, the system boot time reduces 67 seconds on the machine. Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages") Signed-off-by: "Huang, Ying" Originally-by: Kyle Meyer Reported-and-tested-by: Kyle Meyer Cc: Sudeep Holla Cc: Mel Gorman Signed-off-by: Andrew Morton --- drivers/base/cacheinfo.c | 50 +++++++++++++++++++++++++++++++++++----- include/linux/gfp.h | 2 +- mm/page_alloc.c | 39 +++++++++++++++---------------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index f1e79263fe61eb..23b8cba4a2a3b8 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -898,6 +898,37 @@ static int cache_add_dev(unsigned int cpu) return rc; } +static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu, + cpumask_t **map) +{ + struct cacheinfo *llc, *sib_llc; + unsigned int sibling; + + if (!last_level_cache_is_valid(cpu)) + return 0; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return 0; + + if (online) { + *map = &llc->shared_cpu_map; + return cpumask_weight(*map); + } + + /* shared_cpu_map of offlined CPU will be cleared, so use sibling map */ + for_each_cpu(sibling, &llc->shared_cpu_map) { + if (sibling == cpu || !last_level_cache_is_valid(sibling)) + continue; + sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1); + *map = &sib_llc->shared_cpu_map; + return cpumask_weight(*map); + } + + return 0; +} + /* * Calculate the size of the per-CPU data cache slice. This can be * used to estimate the size of the data cache slice that can be used @@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu) ci->per_cpu_data_slice_size = llc->size / nr_shared; } -static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu) +static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu, + cpumask_t *cpu_map) { unsigned int icpu; - for_each_online_cpu(icpu) { + for_each_cpu(icpu, cpu_map) { if (!cpu_online && icpu == cpu) continue; update_per_cpu_data_slice_size_cpu(icpu); + setup_pcp_cacheinfo(icpu); } } static int cacheinfo_cpu_online(unsigned int cpu) { int rc = detect_cache_attributes(cpu); + cpumask_t *cpu_map; if (rc) return rc; rc = cache_add_dev(cpu); if (rc) goto err; - update_per_cpu_data_slice_size(true, cpu); - setup_pcp_cacheinfo(); + if (cpu_map_shared_cache(true, cpu, &cpu_map)) + update_per_cpu_data_slice_size(true, cpu, cpu_map); return 0; err: free_cache_attributes(cpu); @@ -959,12 +993,16 @@ static int cacheinfo_cpu_online(unsigned int cpu) static int cacheinfo_cpu_pre_down(unsigned int cpu) { + cpumask_t *cpu_map; + unsigned int nr_shared; + + nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map); if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map)) cpu_cache_sysfs_exit(cpu); free_cache_attributes(cpu); - update_per_cpu_data_slice_size(false, cpu); - setup_pcp_cacheinfo(); + if (nr_shared > 1) + update_per_cpu_data_slice_size(false, cpu, cpu_map); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a0071389e..09e22091f1b03f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); -void setup_pcp_cacheinfo(void); +void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aeda1153dad92b..140c4f372db169 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5578,37 +5578,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } -static void zone_pcp_update_cacheinfo(struct zone *zone) +static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { - int cpu; struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - for_each_online_cpu(cpu) { - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - cci = get_cpu_cacheinfo(cpu); - /* - * If data cache slice of CPU is large enough, "pcp->batch" - * pages can be preserved in PCP before draining PCP for - * consecutive high-order pages freeing without allocation. - * This can reduce zone lock contention without hurting - * cache-hot pages sharing. - */ - spin_lock(&pcp->lock); - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) - pcp->flags |= PCPF_FREE_HIGH_BATCH; - else - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); - } + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. + */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); } -void setup_pcp_cacheinfo(void) +void setup_pcp_cacheinfo(unsigned int cpu) { struct zone *zone; for_each_populated_zone(zone) - zone_pcp_update_cacheinfo(zone); + zone_pcp_update_cacheinfo(zone, cpu); } /* From 45ae2add29e2853f4dae8d37324c7f820921b583 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:43 +0000 Subject: [PATCH 513/707] x86/mm: delete unused cpu argument to leave_mm() The argument is unused since commit 3d28ebceaffa ("x86/mm: Rework lazy TLB to track the actual loaded mm"), delete it. Link: https://lkml.kernel.org/r/20240126080644.1714297-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/mmu.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/mm/tlb.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- include/linux/mmu_context.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 0da5c227f490c0..ce4677b8b7356c 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -75,7 +75,7 @@ typedef struct { .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } -void leave_mm(int cpu); +void leave_mm(void); #define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cc130b57542ac4..66bd265c7a5876 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); + leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab6e..80b0caa82a91b4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, write_cr3(new_mm_cr3); } -void leave_mm(int cpu) +void leave_mm(void) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 72af496a160c8b..218773cfb009f7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info) struct mm_struct *mm = info; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) - leave_mm(smp_processor_id()); + leave_mm(); /* * If this cpu still has a stale cr3 reference, then make sure diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 737a026ef58a38..02e40fd7d948c9 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, } if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(dev->cpu); + leave_mm(); /* Take note of the planned idle state. */ sched_idle_set_state(target_state); diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index f2b7a3f040999e..bbaec80c78c505 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -11,7 +11,7 @@ #endif #ifndef leave_mm -static inline void leave_mm(int cpu) { } +static inline void leave_mm(void) { } #endif /* From 6e2bbaa08ca958915e1da968b64327ea6ddc38e2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:44 +0000 Subject: [PATCH 514/707] x86/mm: clarify "prev" usage in switch_mm_irqs_off() In the x86 implementation of switch_mm_irqs_off(), we do not use the "prev" argument passed in by the caller, we use exclusively use "real_prev", which is cpu_tlbstate.loaded_mm. This is not obvious at the first sight. Furthermore, a comment describes a condition that happens when called with prev == next, but this should not affect the function in any way since prev is unused. Apparently, the comment is intended to clarify why we don't rely on prev == next to decide whether we need to update CR3, but again, it is not obvious. The comment also references the fact that leave_mm() calls with prev == NULL and tsk == NULL, but this also shouldn't matter because prev is unused and tsk is only used in one function which has a NULL check. Clarify things by renaming (prev -> unused) and (real_prev -> prev), also move and rewrite the comment as an explanation for why we don't rely on "prev" supplied by the caller in x86 code and use our own. Hopefully this makes reading the code easier. Link: https://lkml.kernel.org/r/20240126080644.1714297-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/tlb.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 80b0caa82a91b4..bf9605caf24f74 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored) static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif -void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, +/* + * The "prev" argument passed by the caller does not always match CR3. For + * example, the scheduler passes in active_mm when switching from lazy TLB mode + * to normal mode, but switch_mm_irqs_off() can be called from x86 code without + * updating active_mm. Use cpu_tlbstate.loaded_mm instead. + */ +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); @@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, bool need_flush; u16 new_asid; - /* - * NB: The scheduler will call us with prev == next when switching - * from lazy TLB mode to normal mode if active_mm isn't changing. - * When this happens, we don't assume that CR3 (and hence - * cpu_tlbstate.loaded_mm) matches next. - * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill @@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * provides that full memory barrier and core serializing * instruction. */ - if (real_prev == next) { + if (prev == next) { /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); @@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(real_prev != &init_mm && + if (WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * Skip kernel threads; we never send init_mm TLB flushing IPIs, * but the bitmap manipulation can cause cache line contention. */ - if (real_prev != &init_mm) { + if (prev != &init_mm) { VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(real_prev))); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + mm_cpumask(prev))); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); } /* @@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - if (next != real_prev) { + if (next != prev) { cr4_update_pce_mm(next); - switch_ldt(real_prev, next); + switch_ldt(prev, next); } } From 9cf78c52c93d7b16d2b83df1ed6ca1ccb123ef2e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:50 +0000 Subject: [PATCH 515/707] mm/zswap: fix race between lru writeback and swapoff LRU writeback has race problem with swapoff, as spotted by Yosry [1]: CPU1 CPU2 shrink_memcg_cb swap_off list_lru_isolate zswap_invalidate zswap_swapoff kfree(tree) // UAF spin_lock(&tree->lock) The problem is that the entry in lru list can't protect the tree from being swapoff and freed, and the entry also can be invalidated and freed concurrently after we unlock the lru lock. We can fix it by moving the swap cache allocation ahead before referencing the tree, then check invalidate race with tree lock, only after that we can safely deref the entry. Note we couldn't deref entry or tree anymore after we unlock the folio, since we depend on this to hold on swapoff. So this patch moves all tree and entry usage to zswap_writeback_entry(), we only use the copied swpentry on the stack to allocate swap cache and if returned with folio locked we can reference the tree safely. Then we can check invalidate race with tree lock, the following things is much the same like zswap_load(). Since we can't deref the entry after zswap_writeback_entry(), we can't use zswap_lru_putback() anymore, instead we rotate the entry in the beginning. And it will be unlinked and freed when invalidated if writeback success. Another change is we don't update the memcg nr_zswap_protected in the -ENOMEM and -EEXIST cases anymore. -EEXIST case means we raced with swapin or concurrent shrinker action, since swapin already have memcg nr_zswap_protected updated, don't need double counts here. For concurrent shrinker, the folio will be writeback and freed anyway. -ENOMEM case is extremely rare and doesn't happen spuriously either, so don't bother distinguishing this case. [1] https://lore.kernel.org/all/CAJD7tkasHsRnT_75-TXsEe58V9_OW6m3g6CF7Kmsvz8CKRG_EA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-2-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 114 +++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 50a4af3f185607..fcd31fb670cce2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -277,7 +277,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); + swp_entry_t swpentry); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); @@ -445,27 +445,6 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - /********************************* * rbtree functions **********************************/ @@ -860,40 +839,47 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; + swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + /* * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again + * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ - swpoffset = swp_offset(entry->swpentry); - tree = swap_zswap_tree(entry->swpentry); - list_lru_isolate(l, item); + swpentry = entry->swpentry; + /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - writeback_result = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, swpentry); - spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* @@ -903,27 +889,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - - goto put_unlock; + } else { + zswap_written_back_pages++; } - zswap_written_back_pages++; - - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(entry); -unlock: - spin_unlock(&tree->lock); spin_lock(lock); return ret; } @@ -1408,9 +1377,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { - swp_entry_t swpentry = entry->swpentry; + struct zswap_tree *tree; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1426,9 +1395,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1442,18 +1413,31 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ + tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } + + /* Safe to deref entry after the entry is verified above. */ + zswap_entry_get(entry); spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + /* folio is up to date */ folio_mark_uptodate(folio); From 26a7f1648f07aa0c5c856b64b5b82b1233f4f9cd Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:51 +0000 Subject: [PATCH 516/707] mm/list_lru: remove list_lru_putback() Since the only user zswap_lru_putback() has gone, remove list_lru_putback() too. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 16 ---------------- mm/list_lru.c | 14 -------------- mm/zswap.c | 2 +- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index c679e6b293c4c4..f2882a82069027 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -168,22 +168,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. - * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index 158781d1d3c215..61f3b6b1134fbe 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) -{ - struct list_lru_one *list = - list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - - if (list_empty(item)) { - list_add_tail(item, &list->list); - if (!list->nr_items++) - set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - } -} -EXPORT_SYMBOL_GPL(list_lru_putback); - unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/zswap.c b/mm/zswap.c index fcd31fb670cce2..7f88b3a77e4a84 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -411,7 +411,7 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). + * Similar reasoning holds for list_lru_del(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); From d869d3fb362c51a59c173fdee050dc100ff68383 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Jan 2024 11:07:01 +0100 Subject: [PATCH 517/707] stackdepot: use variable size records for non-evictable entries With the introduction of stack depot evictions, each stack record is now fixed size, so that future reuse after an eviction can safely store differently sized stack traces. In all cases that do not make use of evictions, this wastes lots of space. Fix it by re-introducing variable size stack records (up to the max allowed size) for entries that will never be evicted. We know if an entry will never be evicted if the flag STACK_DEPOT_FLAG_GET is not provided, since a later stack_depot_put() attempt is undefined behavior. With my current kernel config that enables KASAN and also SLUB owner tracking, I observe (after a kernel boot) a whopping reduction of 296 stack depot pools, which translates into 4736 KiB saved. The savings here are from SLUB owner tracking only, because KASAN generic mode still uses refcounting. Before: pools: 893 allocations: 29841 frees: 6524 in_use: 23317 freelist_size: 3454 After: pools: 597 refcounted_allocations: 17547 refcounted_frees: 6477 refcounted_in_use: 11070 freelist_size: 3497 persistent_count: 12163 persistent_bytes: 1717008 Link: https://lkml.kernel.org/r/20240129100708.39460-1-elver@google.com Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces") Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/poison.h | 3 + lib/stackdepot.c | 250 +++++++++++++++++++++-------------------- 2 files changed, 130 insertions(+), 123 deletions(-) diff --git a/include/linux/poison.h b/include/linux/poison.h index 27a7dad17eefb8..1f0ee2459f2aa2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -92,4 +92,7 @@ /********** VFS **********/ #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) +/********** lib/stackdepot.c **********/ +#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) + #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 5caa1f56655384..8f3b2c84ec2db3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -43,17 +44,7 @@ #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) -#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32 -/* - * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack - * traces. As KMSAN does not support evicting stack traces from the stack - * depot, the stack depot capacity might be reached quickly with large stack - * records. Adjust the maximum number of stack depot pools for this case. - */ -#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16)) -#else #define DEPOT_POOLS_CAP 8192 -#endif #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) @@ -93,9 +84,6 @@ struct stack_record { }; }; -#define DEPOT_STACK_RECORD_SIZE \ - ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN) - static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -121,32 +109,31 @@ static void *stack_pools[DEPOT_MAX_POOLS]; static void *new_pool; /* Number of pools in stack_pools. */ static int pools_num; +/* Offset to the unused space in the currently used pool. */ +static size_t pool_offset = DEPOT_POOL_SIZE; /* Freelist of stack records within stack_pools. */ static LIST_HEAD(free_stacks); -/* - * Stack depot tries to keep an extra pool allocated even before it runs out - * of space in the currently used pool. This flag marks whether this extra pool - * needs to be allocated. It has the value 0 when either an extra pool is not - * yet allocated or if the limit on the number of pools is reached. - */ -static bool new_pool_required = true; /* The lock must be held when performing pool or freelist modifications. */ static DEFINE_RAW_SPINLOCK(pool_lock); /* Statistics counters for debugfs. */ enum depot_counter_id { - DEPOT_COUNTER_ALLOCS, - DEPOT_COUNTER_FREES, - DEPOT_COUNTER_INUSE, + DEPOT_COUNTER_REFD_ALLOCS, + DEPOT_COUNTER_REFD_FREES, + DEPOT_COUNTER_REFD_INUSE, DEPOT_COUNTER_FREELIST_SIZE, + DEPOT_COUNTER_PERSIST_COUNT, + DEPOT_COUNTER_PERSIST_BYTES, DEPOT_COUNTER_COUNT, }; static long counters[DEPOT_COUNTER_COUNT]; static const char *const counter_names[] = { - [DEPOT_COUNTER_ALLOCS] = "allocations", - [DEPOT_COUNTER_FREES] = "frees", - [DEPOT_COUNTER_INUSE] = "in_use", + [DEPOT_COUNTER_REFD_ALLOCS] = "refcounted_allocations", + [DEPOT_COUNTER_REFD_FREES] = "refcounted_frees", + [DEPOT_COUNTER_REFD_INUSE] = "refcounted_in_use", [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size", + [DEPOT_COUNTER_PERSIST_COUNT] = "persistent_count", + [DEPOT_COUNTER_PERSIST_BYTES] = "persistent_bytes", }; static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT); @@ -294,48 +281,52 @@ int stack_depot_init(void) EXPORT_SYMBOL_GPL(stack_depot_init); /* - * Initializes new stack depot @pool, release all its entries to the freelist, - * and update the list of pools. + * Initializes new stack pool, and updates the list of pools. */ -static void depot_init_pool(void *pool) +static bool depot_init_pool(void **prealloc) { - int offset; - lockdep_assert_held(&pool_lock); - /* Initialize handles and link stack records into the freelist. */ - for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; - offset += DEPOT_STACK_RECORD_SIZE) { - struct stack_record *stack = pool + offset; - - stack->handle.pool_index = pools_num; - stack->handle.offset = offset >> DEPOT_STACK_ALIGN; - stack->handle.extra = 0; - - /* - * Stack traces of size 0 are never saved, and we can simply use - * the size field as an indicator if this is a new unused stack - * record in the freelist. - */ - stack->size = 0; + if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + /* Bail out if we reached the pool limit. */ + WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */ + WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */ + WARN_ONCE(1, "Stack depot reached limit capacity"); + return false; + } - INIT_LIST_HEAD(&stack->hash_list); - /* - * Add to the freelist front to prioritize never-used entries: - * required in case there are entries in the freelist, but their - * RCU cookie still belongs to the current RCU grace period - * (there can still be concurrent readers). - */ - list_add(&stack->free_list, &free_stacks); - counters[DEPOT_COUNTER_FREELIST_SIZE]++; + if (!new_pool && *prealloc) { + /* We have preallocated memory, use it. */ + WRITE_ONCE(new_pool, *prealloc); + *prealloc = NULL; } + if (!new_pool) + return false; /* new_pool and *prealloc are NULL */ + /* Save reference to the pool to be used by depot_fetch_stack(). */ - stack_pools[pools_num] = pool; + stack_pools[pools_num] = new_pool; + + /* + * Stack depot tries to keep an extra pool allocated even before it runs + * out of space in the currently used pool. + * + * To indicate that a new preallocation is needed new_pool is reset to + * NULL; do not reset to NULL if we have reached the maximum number of + * pools. + */ + if (pools_num < DEPOT_MAX_POOLS) + WRITE_ONCE(new_pool, NULL); + else + WRITE_ONCE(new_pool, STACK_DEPOT_POISON); /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */ WRITE_ONCE(pools_num, pools_num + 1); ASSERT_EXCLUSIVE_WRITER(pools_num); + + pool_offset = 0; + + return true; } /* Keeps the preallocated memory to be used for a new stack depot pool. */ @@ -347,63 +338,51 @@ static void depot_keep_new_pool(void **prealloc) * If a new pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. */ - if (!new_pool_required) + if (new_pool) return; - /* - * Use the preallocated memory for the new pool - * as long as we do not exceed the maximum number of pools. - */ - if (pools_num < DEPOT_MAX_POOLS) { - new_pool = *prealloc; - *prealloc = NULL; - } - - /* - * At this point, either a new pool is kept or the maximum - * number of pools is reached. In either case, take note that - * keeping another pool is not required. - */ - WRITE_ONCE(new_pool_required, false); + WRITE_ONCE(new_pool, *prealloc); + *prealloc = NULL; } /* - * Try to initialize a new stack depot pool from either a previous or the - * current pre-allocation, and release all its entries to the freelist. + * Try to initialize a new stack record from the current pool, a cached pool, or + * the current pre-allocation. */ -static bool depot_try_init_pool(void **prealloc) +static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) { + struct stack_record *stack; + void *current_pool; + u32 pool_index; + lockdep_assert_held(&pool_lock); - /* Check if we have a new pool saved and use it. */ - if (new_pool) { - depot_init_pool(new_pool); - new_pool = NULL; + if (pool_offset + size > DEPOT_POOL_SIZE) { + if (!depot_init_pool(prealloc)) + return NULL; + } - /* Take note that we might need a new new_pool. */ - if (pools_num < DEPOT_MAX_POOLS) - WRITE_ONCE(new_pool_required, true); + if (WARN_ON_ONCE(pools_num < 1)) + return NULL; + pool_index = pools_num - 1; + current_pool = stack_pools[pool_index]; + if (WARN_ON_ONCE(!current_pool)) + return NULL; - return true; - } + stack = current_pool + pool_offset; - /* Bail out if we reached the pool limit. */ - if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return false; - } + /* Pre-initialize handle once. */ + stack->handle.pool_index = pool_index; + stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; + stack->handle.extra = 0; + INIT_LIST_HEAD(&stack->hash_list); - /* Check if we have preallocated memory and use it. */ - if (*prealloc) { - depot_init_pool(*prealloc); - *prealloc = NULL; - return true; - } + pool_offset += size; - return false; + return stack; } -/* Try to find next free usable entry. */ +/* Try to find next free usable entry from the freelist. */ static struct stack_record *depot_pop_free(void) { struct stack_record *stack; @@ -420,7 +399,7 @@ static struct stack_record *depot_pop_free(void) * check the first entry. */ stack = list_first_entry(&free_stacks, struct stack_record, free_list); - if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state)) + if (!poll_state_synchronize_rcu(stack->rcu_state)) return NULL; list_del(&stack->free_list); @@ -429,48 +408,73 @@ static struct stack_record *depot_pop_free(void) return stack; } +static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries) +{ + const size_t used = flex_array_size(s, entries, nr_entries); + const size_t unused = sizeof(s->entries) - used; + + WARN_ON_ONCE(sizeof(s->entries) < used); + + return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN); +} + /* Allocates a new stack in a stack depot pool. */ static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc) { - struct stack_record *stack; + struct stack_record *stack = NULL; + size_t record_size; lockdep_assert_held(&pool_lock); /* This should already be checked by public API entry points. */ - if (WARN_ON_ONCE(!size)) + if (WARN_ON_ONCE(!nr_entries)) return NULL; - /* Check if we have a stack record to save the stack trace. */ - stack = depot_pop_free(); - if (!stack) { - /* No usable entries on the freelist - try to refill the freelist. */ - if (!depot_try_init_pool(prealloc)) - return NULL; + /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ + if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES) + nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES; + + if (flags & STACK_DEPOT_FLAG_GET) { + /* + * Evictable entries have to allocate the max. size so they may + * safely be re-used by differently sized allocations. + */ + record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES); stack = depot_pop_free(); - if (WARN_ON(!stack)) - return NULL; + } else { + record_size = depot_stack_record_size(stack, nr_entries); } - /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ - if (size > CONFIG_STACKDEPOT_MAX_FRAMES) - size = CONFIG_STACKDEPOT_MAX_FRAMES; + if (!stack) { + stack = depot_pop_free_pool(prealloc, record_size); + if (!stack) + return NULL; + } /* Save the stack trace. */ stack->hash = hash; - stack->size = size; - /* stack->handle is already filled in by depot_init_pool(). */ - refcount_set(&stack->count, 1); - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + stack->size = nr_entries; + /* stack->handle is already filled in by depot_pop_free_pool(). */ + memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries)); + + if (flags & STACK_DEPOT_FLAG_GET) { + refcount_set(&stack->count, 1); + counters[DEPOT_COUNTER_REFD_ALLOCS]++; + counters[DEPOT_COUNTER_REFD_INUSE]++; + } else { + /* Warn on attempts to switch to refcounting this entry. */ + refcount_set(&stack->count, REFCOUNT_SATURATED); + counters[DEPOT_COUNTER_PERSIST_COUNT]++; + counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size; + } /* * Let KMSAN know the stored stack record is initialized. This shall * prevent false positive reports if instrumented code accesses it. */ - kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE); + kmsan_unpoison_memory(stack, record_size); - counters[DEPOT_COUNTER_ALLOCS]++; - counters[DEPOT_COUNTER_INUSE]++; return stack; } @@ -538,8 +542,8 @@ static void depot_free_stack(struct stack_record *stack) list_add_tail(&stack->free_list, &free_stacks); counters[DEPOT_COUNTER_FREELIST_SIZE]++; - counters[DEPOT_COUNTER_FREES]++; - counters[DEPOT_COUNTER_INUSE]--; + counters[DEPOT_COUNTER_REFD_FREES]++; + counters[DEPOT_COUNTER_REFD_INUSE]--; printk_deferred_exit(); raw_spin_unlock_irqrestore(&pool_lock, flags); @@ -660,7 +664,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * Allocate memory for a new pool if required now: * we won't be able to do that under the lock. */ - if (unlikely(can_alloc && READ_ONCE(new_pool_required))) { + if (unlikely(can_alloc && !READ_ONCE(new_pool))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -681,7 +685,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, found = find_stack(bucket, entries, nr_entries, hash, depot_flags); if (!found) { struct stack_record *new = - depot_alloc_stack(entries, nr_entries, hash, &prealloc); + depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc); if (new) { /* From 2ef8127ce56d76a4f861e7f0c40e241409c18087 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Jan 2024 11:07:02 +0100 Subject: [PATCH 518/707] kasan: revert eviction of stack traces in generic mode This partially reverts commits cc478e0b6bdf, 63b85ac56a64, 08d7c94d9635, a414d4286f34, and 773688a6cb24 to make use of variable-sized stack depot records, since eviction of stack entries from stack depot forces fixed- sized stack records. Care was taken to retain the code cleanups by the above commits. Eviction was added to generic KASAN as a response to alleviating the additional memory usage from fixed-sized stack records, but this still uses more memory than previously. With the re-introduction of variable-sized records for stack depot, we can just switch back to non-evictable stack records again, and return back to the previous performance and memory usage baseline. Before (observed after a KASAN kernel boot): pools: 597 refcounted_allocations: 17547 refcounted_frees: 6477 refcounted_in_use: 11070 freelist_size: 3497 persistent_count: 12163 persistent_bytes: 1717008 After: pools: 319 refcounted_allocations: 0 refcounted_frees: 0 refcounted_in_use: 0 freelist_size: 0 persistent_count: 29397 persistent_bytes: 5183536 As can be seen from the counters, with a generic KASAN config, refcounted allocations and evictions are no longer used. Due to using variable-sized records, I observe a reduction of 278 stack depot pools (saving 4448 KiB) with my test setup. Link: https://lkml.kernel.org/r/20240129100708.39460-2-elver@google.com Fixes: cc478e0b6bdf ("kasan: avoid resetting aux_lock") Fixes: 63b85ac56a64 ("kasan: stop leaking stack trace handles") Fixes: 08d7c94d9635 ("kasan: memset free track in qlink_free") Fixes: a414d4286f34 ("kasan: handle concurrent kasan_record_aux_stack calls") Fixes: 773688a6cb24 ("kasan: use stack_depot_put for Generic mode") Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++--- mm/kasan/generic.c | 68 +++++-------------------------------------- mm/kasan/kasan.h | 10 ------- mm/kasan/quarantine.c | 5 +++- 4 files changed, 14 insertions(+), 77 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 610efae9122094..6ca63e8dda741b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -65,8 +65,7 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags) { depot_stack_handle_t stack; - stack = kasan_save_stack(flags, - STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); + stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); kasan_set_track(track, stack); } @@ -266,10 +265,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, return true; /* - * If the object is not put into quarantine, it will likely be quickly - * reallocated. Thus, release its metadata now. + * Note: Keep per-object metadata to allow KASAN print stack traces for + * use-after-free-before-realloc bugs. */ - kasan_release_object_meta(cache, object); /* Let slab put the object onto the freelist. */ return false; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index df6627f62402c0..fc9cf1860efb34 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -485,16 +485,6 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) if (alloc_meta) { /* Zero out alloc meta to mark it as invalid. */ __memset(alloc_meta, 0, sizeof(*alloc_meta)); - - /* - * Prepare the lock for saving auxiliary stack traces. - * Temporarily disable KASAN bug reporting to allow instrumented - * raw_spin_lock_init to access aux_lock, which resides inside - * of a redzone. - */ - kasan_disable_current(); - raw_spin_lock_init(&alloc_meta->aux_lock); - kasan_enable_current(); } /* @@ -506,18 +496,8 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) static void release_alloc_meta(struct kasan_alloc_meta *meta) { - /* Evict the stack traces from stack depot. */ - stack_depot_put(meta->alloc_track.stack); - stack_depot_put(meta->aux_stack[0]); - stack_depot_put(meta->aux_stack[1]); - - /* - * Zero out alloc meta to mark it as invalid but keep aux_lock - * initialized to avoid having to reinitialize it when another object - * is allocated in the same slot. - */ - __memset(&meta->alloc_track, 0, sizeof(meta->alloc_track)); - __memset(meta->aux_stack, 0, sizeof(meta->aux_stack)); + /* Zero out alloc meta to mark it as invalid. */ + __memset(meta, 0, sizeof(*meta)); } static void release_free_meta(const void *object, struct kasan_free_meta *meta) @@ -526,27 +506,10 @@ static void release_free_meta(const void *object, struct kasan_free_meta *meta) if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; - /* Evict the stack trace from the stack depot. */ - stack_depot_put(meta->free_track.stack); - /* Mark free meta as invalid. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; } -void kasan_release_object_meta(struct kmem_cache *cache, const void *object) -{ - struct kasan_alloc_meta *alloc_meta; - struct kasan_free_meta *free_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - release_alloc_meta(alloc_meta); - - free_meta = kasan_get_free_meta(cache, object); - if (free_meta) - release_free_meta(object, free_meta); -} - size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { struct kasan_cache *info = &cache->kasan_info; @@ -571,8 +534,6 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) struct kmem_cache *cache; struct kasan_alloc_meta *alloc_meta; void *object; - depot_stack_handle_t new_handle, old_handle; - unsigned long flags; if (is_kfence_address(addr) || !slab) return; @@ -583,33 +544,18 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) if (!alloc_meta) return; - new_handle = kasan_save_stack(0, depot_flags); - - /* - * Temporarily disable KASAN bug reporting to allow instrumented - * spinlock functions to access aux_lock, which resides inside of a - * redzone. - */ - kasan_disable_current(); - raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags); - old_handle = alloc_meta->aux_stack[1]; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = new_handle; - raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags); - kasan_enable_current(); - - stack_depot_put(old_handle); + alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, - STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET); + return __kasan_record_aux_stack(addr, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -620,7 +566,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) if (!alloc_meta) return; - /* Evict previous stack traces (might exist for krealloc or mempool). */ + /* Invalidate previous stack traces (might exist for krealloc or mempool). */ release_alloc_meta(alloc_meta); kasan_save_track(&alloc_meta->alloc_track, flags); @@ -634,7 +580,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) if (!free_meta) return; - /* Evict previous stack trace (might exist for mempool). */ + /* Invalidate previous stack trace (might exist for mempool). */ release_free_meta(object, free_meta); kasan_save_track(&free_meta->free_track, 0); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d0f172f2b9783f..fb2b9ac0659a7a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -265,13 +264,6 @@ struct kasan_global { struct kasan_alloc_meta { struct kasan_track alloc_track; /* Free track is stored in kasan_free_meta. */ - /* - * aux_lock protects aux_stack from accesses from concurrent - * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping - * on RT kernels, as kasan_record_aux_stack_noalloc can be called from - * non-sleepable contexts. - */ - raw_spinlock_t aux_lock; depot_stack_handle_t aux_stack[2]; }; @@ -398,10 +390,8 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); -void kasan_release_object_meta(struct kmem_cache *cache, const void *object); #else static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } -static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3ba02efb952aac..6958aa713c67ee 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -145,7 +145,10 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) void *object = qlink_to_object(qlink, cache); struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object); - kasan_release_object_meta(cache, object); + /* + * Note: Keep per-object metadata to allow KASAN print stack traces for + * use-after-free-before-realloc bugs. + */ /* * If init_on_free is enabled and KASAN's free metadata is stored in From 5ff5178495b3fd5624543b8bdcdd5ba5e7cafe24 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 29 Jan 2024 13:45:51 +0800 Subject: [PATCH 519/707] mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check khugepaged scans the entire address space in the background for each given mm, looking for opportunities to merge sequences of basic pages into huge pages. However, when an mm is inserted to the mm_slots list, and the MMF_DISABLE_THP flag is set later, this scanning process becomes unnecessary for that mm and can be skipped to avoid redundant operations, especially in scenarios with a large address space. This commit introduces a check before each scanning process to test the MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning process is bypassed, thereby improving the efficiency of khugepaged. Link: https://lkml.kernel.org/r/20240129054551.57728-1-ioworker0@gmail.com Signed-off-by: Lance Yang Cc: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fe43fbc4452539..2771fc043b3b8b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +{ + return hpage_collapse_test_exit(mm) || + test_bit(MMF_DISABLE_THP, &mm->flags); +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (hpage_collapse_test_exit_or_disable(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } @@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find From 36c6faa33b878745f30c544b2189d43aacec96ef Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jan 2024 21:01:53 +0800 Subject: [PATCH 520/707] mm: compaction: limit the suitable target page order to be less than cc->order It can not improve the fragmentation if we isolate the target free pages exceeding cc->order, especially when the cc->order is less than pageblock_order. For example, suppose the pageblock_order is MAX_ORDER (size is 4M) and cc->order is 2M THP size, we should not isolate other 2M free pages to be the migration target, which can not improve the fragmentation. Moreover this is also applicable for large folio compaction. Link: https://lkml.kernel.org/r/afcd9377351c259df7a25a388a4a0d5862b986f4.1705928395.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 335a6f6787e4e3..055687e6bd17b6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1415,12 +1415,14 @@ static bool suitable_migration_target(struct compact_control *cc, { /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { + int order = cc->order > 0 ? cc->order : pageblock_order; + /* * We are checking page_order without zone->lock taken. But * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. */ - if (buddy_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= order) return false; } From 3fba0bccc287b087a2262bc7f53dc7883e34e4e4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:37 -0500 Subject: [PATCH 521/707] mm: zswap: rename zswap_free_entry to zswap_entry_free There is a zswap_entry_ namespace with multiple functions already. Link: https://lkml.kernel.org/r/20240130014208.565554-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7f88b3a77e4a84..173f2e6657de65 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -520,7 +520,7 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_entry *entry) +static void zswap_entry_free(struct zswap_entry *entry) { if (!entry->length) atomic_dec(&zswap_same_filled_pages); @@ -555,7 +555,7 @@ static void zswap_entry_put(struct zswap_entry *entry) WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); + zswap_entry_free(entry); } } From 32068a78f7a4d137f14580e96db2d681342ad853 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:38 -0500 Subject: [PATCH 522/707] mm: zswap: inline and remove zswap_entry_find_get() There is only one caller and the function is trivial. Inline it. Link: https://lkml.kernel.org/r/20240130014208.565554-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 173f2e6657de65..cf864aaa214d34 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -559,19 +559,6 @@ static void zswap_entry_put(struct zswap_entry *entry) } } -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - /********************************* * shrinker functions **********************************/ @@ -1708,13 +1695,13 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + zswap_entry_get(entry); spin_unlock(&tree->lock); if (entry->length) From 4231a94f31606062d83b77c5704d3ef20e180b36 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:39 -0500 Subject: [PATCH 523/707] mm: zswap: move zswap_invalidate_entry() to related functions Move it up to the other tree and refcounting functions. Link: https://lkml.kernel.org/r/20240130014208.565554-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index cf864aaa214d34..9f05282efe3c22 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -559,6 +559,18 @@ static void zswap_entry_put(struct zswap_entry *entry) } } +/* + * If the entry is still valid in the tree, drop the initial ref and remove it + * from the tree. This function must be called with an additional ref held, + * otherwise it may race with another invalidation freeing the entry. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + if (zswap_rb_erase(&tree->rbroot, entry)) + zswap_entry_put(entry); +} + /********************************* * shrinker functions **********************************/ @@ -809,18 +821,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 1a274d5dddf116a0900f4c32c91f9a04bef7103d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:40 -0500 Subject: [PATCH 524/707] mm: zswap: warn when referencing a dead entry Put a standard sanity check on zswap_entry_get() for UAF scenario. Link: https://lkml.kernel.org/r/20240130014208.565554-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zswap.c b/mm/zswap.c index 9f05282efe3c22..0c6adaf2fdb6a3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -542,6 +542,7 @@ static void zswap_entry_free(struct zswap_entry *entry) /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { + WARN_ON_ONCE(!entry->refcount); entry->refcount++; } From a2c7d38cf4915b8235839c5451ef82939b3af7df Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:41 -0500 Subject: [PATCH 525/707] mm: zswap: clean up zswap_entry_put() Remove stale comment and unnecessary local variable. Link: https://lkml.kernel.org/r/20240130014208.565554-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0c6adaf2fdb6a3..7a7e8da2b4f8c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -546,15 +546,11 @@ static void zswap_entry_get(struct zswap_entry *entry) entry->refcount++; } -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ +/* caller must hold the tree lock */ static void zswap_entry_put(struct zswap_entry *entry) { - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { + WARN_ON_ONCE(!entry->refcount); + if (--entry->refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_entry_free(entry); } From a4a9d1ffefbb68b09050284788de93921ec9ab15 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:42 -0500 Subject: [PATCH 526/707] mm: zswap: rename __zswap_load() to zswap_decompress() Link: https://lkml.kernel.org/r/20240130014208.565554-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7a7e8da2b4f8c3..bdc9f82fe4b90d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1316,7 +1316,7 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1411,7 +1411,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_entry_get(entry); spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1702,7 +1702,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); From 6db85df064f864e16eefdca06c475a340a257cce Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:43 -0500 Subject: [PATCH 527/707] mm: zswap: break out zwap_compress() zswap_store() is long and mixes work at the zswap layer with work at the backend and compression layer. Move compression & backend work to zswap_compress(), mirroring zswap_decompress(). Link: https://lkml.kernel.org/r/20240130014208.565554-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 145 ++++++++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index bdc9f82fe4b90d..f9b9494156ba60 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1316,6 +1316,79 @@ static int zswap_enabled_param_set(const char *val, return ret; } +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); @@ -1472,18 +1545,11 @@ bool zswap_store(struct folio *folio) struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; + unsigned long value; + u8 *src; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1568,65 +1634,10 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + if (!zswap_compress(folio, entry)) + goto put_pool; - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ entry->swpentry = swp; - entry->handle = handle; - entry->length = dlen; insert_entry: entry->objcg = objcg; @@ -1663,8 +1674,6 @@ bool zswap_store(struct folio *folio) return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: From b1a2a0ed5d831cd9bb755e170260626f2fa43223 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:44 -0500 Subject: [PATCH 528/707] mm: zswap: further cleanup zswap_store() - Remove dupentry, reusing entry works just fine. - Rename pool to shrink_pool, as this one actually is confusing. - Remove page, use folio_nid() and kmap_local_folio() directly. - Set entry->swpentry in a common path. - Move value and src to local scope of use. Link: https://lkml.kernel.org/r/20240130014208.565554-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f9b9494156ba60..cde309c539b337 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1542,14 +1542,11 @@ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - unsigned long value; - u8 *src; + struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1567,10 +1564,10 @@ bool zswap_store(struct folio *folio) * the tree, and it might be written back overriding the new data. */ spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) { + zswap_invalidate_entry(tree, entry); zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); objcg = get_obj_cgroup_from_folio(folio); @@ -1598,17 +1595,19 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); + unsigned long value; + u8 *src; + + src = kmap_local_folio(folio, 0); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1637,9 +1636,8 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; - entry->swpentry = swp; - insert_entry: + entry->swpentry = swp; entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); @@ -1684,9 +1682,9 @@ bool zswap_store(struct folio *folio) return false; shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); + shrink_pool = zswap_pool_last_get(); + if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) + zswap_pool_put(shrink_pool); goto reject; } From c9070092b658e578e5b6e80ed8b68ec327162237 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:45 -0500 Subject: [PATCH 529/707] mm: zswap: simplify zswap_invalidate() The branching is awkward and duplicates code. The comment about writeback is also misleading: yes, the entry might have been written back. Or it might have never been stored in zswap to begin with due to a rejection - zswap_invalidate() is called on all exiting swap entries. Link: https://lkml.kernel.org/r/20240130014208.565554-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index cde309c539b337..082d076a758d19 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1739,15 +1739,10 @@ void zswap_invalidate(int type, pgoff_t offset) struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; - /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); + if (entry) + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } From 0858aef97827e36254120e075ab42919cf761e2a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:46 -0500 Subject: [PATCH 530/707] mm: zswap: function ordering: pool alloc & free The function ordering in zswap.c is a little chaotic, which requires jumping in unexpected directions when following related code. This is a series of patches that brings the file into the following order: - pool functions - lru functions - rbtree functions - zswap entry functions - compression/backend functions - writeback & shrinking functions - store, load, invalidate, swapon, swapoff - debugfs - init But it has to be split up such the moving still produces halfway readable diffs. In this patch, move pool allocation and freeing functions. Link: https://lkml.kernel.org/r/20240130014208.565554-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 297 +++++++++++++++++++++++++++-------------------------- 1 file changed, 152 insertions(+), 145 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 082d076a758d19..805d9a35f63387 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -320,6 +320,158 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/********************************* +* pool functions +**********************************/ + +static void zswap_alloc_shrinker(struct zswap_pool *pool); +static void shrink_worker(struct work_struct *w); + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + int i; + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); + + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); + INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); + + zswap_pool_debug("created", pool); + + return pool; + +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + while (i--) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); + return NULL; +} + +static struct zswap_pool *__zswap_pool_create_fallback(void) +{ + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + int i; + + zswap_pool_debug("destroying", pool); + + shrinker_free(pool->shrinker); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -970,151 +1122,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) -{ - int i; - struct zswap_pool *pool; - char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; - - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. - */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); - - strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); - if (!pool->acomp_ctx) { - pr_err("percpu alloc failed\n"); - goto error; - } - - ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, - &pool->node); - if (ret) - goto error; - - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - - /* being the current pool takes 1 ref; this func expects the - * caller to always add the new pool as the current pool - */ - kref_init(&pool->kref); - INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); - - zswap_pool_debug("created", pool); - - return pool; - -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); -error: - if (pool->acomp_ctx) - free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); - return NULL; -} - -static struct zswap_pool *__zswap_pool_create_fallback(void) -{ - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("compressor %s not available, using default %s\n", - zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); - param_free_charp(&zswap_compressor); - zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) - return NULL; - - return zswap_pool_create(zswap_zpool_type, zswap_compressor); -} - -static void zswap_pool_destroy(struct zswap_pool *pool) -{ - int i; - - zswap_pool_debug("destroying", pool); - - shrinker_free(pool->shrinker); - cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); -} - static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) From 8f600872a7ff9350681364463805cafed07020a5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:47 -0500 Subject: [PATCH 531/707] mm: zswap: function ordering: pool refcounting Move pool refcounting functions into the pool section. First the destroy functions, then the get and put which uses them. __zswap_pool_empty() has an upward reference to the global zswap_pools, to sanity check it's not the currently active pool that's being freed. That gets the forward decl for zswap_pool_current(). This puts the get and put function above all callers, so kill the forward decls as well. Link: https://lkml.kernel.org/r/20240130014208.565554-12-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 94 +++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 805d9a35f63387..33775f2224b711 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -278,8 +278,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { @@ -472,6 +470,53 @@ static void zswap_pool_destroy(struct zswap_pool *pool) kfree(pool); } +static void __zswap_pool_release(struct work_struct *work) +{ + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); + + synchronize_rcu(); + + /* nobody should have been able to get a kref... */ + WARN_ON(kref_get_unless_zero(&pool->kref)); + + /* pool is now off zswap_pools list and has no references. */ + zswap_pool_destroy(pool); +} + +static struct zswap_pool *zswap_pool_current(void); + +static void __zswap_pool_empty(struct kref *kref) +{ + struct zswap_pool *pool; + + pool = container_of(kref, typeof(*pool), kref); + + spin_lock(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); + + spin_unlock(&zswap_pools_lock); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + return 0; + + return kref_get_unless_zero(&pool->kref); +} + +static void zswap_pool_put(struct zswap_pool *pool) +{ + kref_put(&pool->kref, __zswap_pool_empty); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1122,51 +1167,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - -static void __zswap_pool_release(struct work_struct *work) -{ - struct zswap_pool *pool = container_of(work, typeof(*pool), - release_work); - - synchronize_rcu(); - - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); - - /* pool is now off zswap_pools list and has no references. */ - zswap_pool_destroy(pool); -} - -static void __zswap_pool_empty(struct kref *kref) -{ - struct zswap_pool *pool; - - pool = container_of(kref, typeof(*pool), kref); - - spin_lock(&zswap_pools_lock); - - WARN_ON(pool == zswap_pool_current()); - - list_del_rcu(&pool->list); - - INIT_WORK(&pool->release_work, __zswap_pool_release); - schedule_work(&pool->release_work); - - spin_unlock(&zswap_pools_lock); -} - -static void zswap_pool_put(struct zswap_pool *pool) -{ - kref_put(&pool->kref, __zswap_pool_empty); -} - /********************************* * param callbacks **********************************/ From 06b2ee23c95c6655586eba48fb25750e0ebd7878 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:48 -0500 Subject: [PATCH 532/707] mm: zswap: function ordering: zswap_pools Move the operations against the global zswap_pools list (current pool, last, find) to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-13-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 150 ++++++++++++++++++++++++++--------------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 33775f2224b711..168afd6767b3be 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -517,6 +517,79 @@ static void zswap_pool_put(struct zswap_pool *pool) kref_put(&pool->kref, __zswap_pool_empty); } +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -938,83 +1011,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/********************************* -* pool functions -**********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 034672cfecbd4ea82e8f0ea57f03d9023a100fb4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:49 -0500 Subject: [PATCH 533/707] mm: zswap: function ordering: pool params Patch series "mm: zswap: cleanups". Cleanups and maintenance items that accumulated while reviewing zswap patches. This patch (of 20): The parameters primarily control pool attributes. Move those operations up to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20240130014208.565554-14-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 312 ++++++++++++++++++++++++++--------------------------- 1 file changed, 156 insertions(+), 156 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 168afd6767b3be..e650fc58711662 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -590,6 +590,162 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +/********************************* +* param callbacks +**********************************/ + +static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) +{ + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return false; + return true; +} + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret = 0; + bool new_pool = false; + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. + */ + ret = param_set_charp(s, kp); + break; + case ZSWAP_INIT_SUCCEED: + new_pool = zswap_pool_changed(s, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't set param, initialization failed\n"); + ret = -ENODEV; + } + mutex_unlock(&zswap_init_lock); + + /* no need to create a new pool, return directly */ + if (!new_pool) + return ret; + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } + + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); +} + +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + int ret = -ENODEV; + + /* if this is load-time (pre-init) param setting, only set param. */ + if (system_state != SYSTEM_RUNNING) + return param_set_bool(val, kp); + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + if (zswap_setup()) + break; + fallthrough; + case ZSWAP_INIT_SUCCEED: + if (!zswap_has_pool) + pr_err("can't enable, no pool configured\n"); + else + ret = param_set_bool(val, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't enable, initialization failed\n"); + } + mutex_unlock(&zswap_init_lock); + + return ret; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1163,162 +1319,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* param callbacks -**********************************/ - -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) -{ - struct zswap_pool *pool, *put_pool = NULL; - char *s = strstrip((char *)val); - int ret = 0; - bool new_pool = false; - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. - */ - ret = param_set_charp(s, kp); - break; - case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't set param, initialization failed\n"); - ret = -ENODEV; - } - mutex_unlock(&zswap_init_lock); - - /* no need to create a new pool, return directly */ - if (!new_pool) - return ret; - - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; - } - - spin_lock(&zswap_pools_lock); - - pool = zswap_pool_find_get(type, compressor); - if (pool) { - zswap_pool_debug("using existing", pool); - WARN_ON(pool == zswap_pool_current()); - list_del_rcu(&pool->list); - } - - spin_unlock(&zswap_pools_lock); - - if (!pool) - pool = zswap_pool_create(type, compressor); - - if (pool) - ret = param_set_charp(s, kp); - else - ret = -EINVAL; - - spin_lock(&zswap_pools_lock); - - if (!ret) { - put_pool = zswap_pool_current(); - list_add_rcu(&pool->list, &zswap_pools); - zswap_has_pool = true; - } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools - * list; if it's new (and empty) then it'll be removed and - * destroyed by the put after we drop the lock - */ - list_add_tail_rcu(&pool->list, &zswap_pools); - put_pool = pool; - } - - spin_unlock(&zswap_pools_lock); - - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. - */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, - * or the new pool we failed to add - */ - if (put_pool) - zswap_pool_put(put_pool); - - return ret; -} - -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - -static int zswap_enabled_param_set(const char *val, - const struct kernel_param *kp) -{ - int ret = -ENODEV; - - /* if this is load-time (pre-init) param setting, only set param. */ - if (system_state != SYSTEM_RUNNING) - return param_set_bool(val, kp); - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - if (zswap_setup()) - break; - fallthrough; - case ZSWAP_INIT_SUCCEED: - if (!zswap_has_pool) - pr_err("can't enable, no pool configured\n"); - else - ret = param_set_bool(val, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't enable, initialization failed\n"); - } - mutex_unlock(&zswap_init_lock); - - return ret; -} - static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; From 3bcc879d25478db7730e98216c3ab0bcac10b3e6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:50 -0500 Subject: [PATCH 534/707] mm: zswap: function ordering: public lru api The zswap entry section sits awkwardly in the middle of LRU-related functions. Group the external LRU API functions first. Link: https://lkml.kernel.org/r/20240130014208.565554-15-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index e650fc58711662..511bfafc1456d0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -746,6 +746,10 @@ static int zswap_enabled_param_set(const char *val, return ret; } +/********************************* +* lru functions +**********************************/ + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -764,6 +768,21 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; @@ -798,24 +817,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } -} - /********************************* * lru functions **********************************/ From e9c7d9d6bda18627e3ac3557d587b73e6cca8518 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:51 -0500 Subject: [PATCH 535/707] mm: zswap: function ordering: move entry sections out of LRU section This completes consolidation of the LRU section. Link: https://lkml.kernel.org/r/20240130014208.565554-16-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 101 ++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 511bfafc1456d0..756d4d575efef2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -768,58 +768,6 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* lru functions -**********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; @@ -872,6 +820,55 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + /********************************* * rbtree functions **********************************/ From f74d04c807e343d4e82450922067c23a4e383426 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:52 -0500 Subject: [PATCH 536/707] mm: zswap: function ordering: move entry section out of tree section The higher-level entry operations modify the tree, so move the entry API after the tree section. Link: https://lkml.kernel.org/r/20240130014208.565554-17-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 756d4d575efef2..80adc2f7d1a2b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -848,27 +848,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) spin_unlock(&zswap_pools_lock); } -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - /********************************* * rbtree functions **********************************/ @@ -930,6 +909,27 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; From 326b395ae838f4e793336a752053e73069ed4fdc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:53 -0500 Subject: [PATCH 537/707] mm: zswap: function ordering: compress & decompress functions Writeback needs to decompress. Move the (de)compression API above what will be the consolidated shrinking/writeback code. Link: https://lkml.kernel.org/r/20240130014208.565554-18-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 207 +++++++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 102 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 80adc2f7d1a2b8..17356b2e35c2cd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -992,6 +992,111 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(entry); } +/********************************* +* compressed storage functions +**********************************/ +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) +{ + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(zpool)) { + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; + zpool_unmap_handle(zpool, entry->handle); + } + + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(&acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); +} + /********************************* * shrinker functions **********************************/ @@ -1317,108 +1422,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) -{ - struct crypto_acomp_ctx *acomp_ctx; - struct scatterlist input, output; - unsigned int dlen = PAGE_SIZE; - unsigned long handle; - struct zpool *zpool; - char *buf; - gfp_t gfp; - int ret; - u8 *dst; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; - goto unlock; - } - - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; - goto unlock; - } - - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - - entry->handle = handle; - entry->length = dlen; - -unlock: - mutex_unlock(&acomp_ctx->mutex); - return ret == 0; -} - -static void zswap_decompress(struct zswap_entry *entry, struct page *page) -{ - struct zpool *zpool = zswap_find_zpool(entry); - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->buffer, src, entry->length); - src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); -} - /********************************* * writeback code **********************************/ From fe8415052d45c7eb5be6844a163b9dbcceaa408b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:54 -0500 Subject: [PATCH 538/707] mm: zswap: function ordering: per-cpu compression infra The per-cpu compression init/exit callbacks are awkwardly in the middle of the shrinker code. Move them up to the compression section. Link: https://lkml.kernel.org/r/20240130014208.565554-19-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 135 ++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 17356b2e35c2cd..acd7dcd1e0f2de 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -995,6 +995,72 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, /********************************* * compressed storage functions **********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; @@ -1201,75 +1267,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. - */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; -} - -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 45e264a25d7c022125692602e17dbc20d663aaf3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:55 -0500 Subject: [PATCH 539/707] mm: zswap: function ordering: writeback Shrinking needs writeback. Naturally, move the writeback code above the shrinking code. Delete the forward decl. Link: https://lkml.kernel.org/r/20240130014208.565554-20-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 183 ++++++++++++++++++++++++++--------------------------- 1 file changed, 90 insertions(+), 93 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index acd7dcd1e0f2de..0cb3437d47eba3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -276,9 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -1163,6 +1160,96 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) zpool_unmap_handle(zpool, entry->handle); } +/********************************* +* writeback code +**********************************/ +/* + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. + * + * This can be thought of as a "resumed writeback" of the folio + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the zswap_store() + * in the first place. After the folio has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zswap_entry *entry, + swp_entry_t swpentry) +{ + struct zswap_tree *tree; + struct folio *folio; + struct mempolicy *mpol; + bool folio_was_allocated; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* try to allocate swap cache folio */ + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) + return -ENOMEM; + + /* + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. + */ + if (!folio_was_allocated) { + folio_put(folio); + return -EEXIST; + } + + /* + * folio is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap folio with old compressed data. + */ + tree = swap_zswap_tree(swpentry); + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); + return -ENOMEM; + } + + /* Safe to deref entry after the entry is verified above. */ + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + + /* folio is up to date */ + folio_mark_uptodate(folio); + + /* move it to the tail of the inactive list after end_writeback */ + folio_set_reclaim(folio); + + /* start writeback */ + __swap_writepage(folio, &wbc); + folio_put(folio); + + return 0; +} + /********************************* * shrinker functions **********************************/ @@ -1419,96 +1506,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* writeback code -**********************************/ -/* - * Attempts to free an entry by adding a folio to the swap cache, - * decompressing the entry data into the folio, and issuing a - * bio write to write the folio back to the swap device. - * - * This can be thought of as a "resumed writeback" of the folio - * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the zswap_store() - * in the first place. After the folio has been decompressed into - * the swap cache, the compressed version stored by zswap can be - * freed. - */ -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry) -{ - struct zswap_tree *tree; - struct folio *folio; - struct mempolicy *mpol; - bool folio_was_allocated; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - /* try to allocate swap cache folio */ - mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); - if (!folio) - return -ENOMEM; - - /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. - */ - if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; - } - - /* - * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. - */ - tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; - } - - /* Safe to deref entry after the entry is verified above. */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - zswap_decompress(entry, &folio->page); - - count_vm_event(ZSWPWB); - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); - - /* folio is up to date */ - folio_mark_uptodate(folio); - - /* move it to the tail of the inactive list after end_writeback */ - folio_set_reclaim(folio); - - /* start writeback */ - __swap_writepage(folio, &wbc); - folio_put(folio); - - return 0; -} - static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; From 7f4e0cf40b8d440755a3c038c4c4c123fe778223 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:56 -0500 Subject: [PATCH 540/707] mm: zswap: function ordering: shrink_memcg_cb shrink_memcg_cb() is called by the shrinker and is based on zswap_writeback_entry(). Move it in between. Save one fwd decl. Link: https://lkml.kernel.org/r/20240130014208.565554-21-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Chengming Zhou Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 125 ++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0cb3437d47eba3..4aea03285532b7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1254,7 +1254,67 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) + *encountered_page_in_swapcache = true; + } else { + zswap_written_back_pages++; + } + + spin_lock(lock); + return ret; +} static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) @@ -1354,69 +1414,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) -{ - struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); - bool *encountered_page_in_swapcache = (bool *)arg; - swp_entry_t swpentry; - enum lru_status ret = LRU_REMOVED_RETRY; - int writeback_result; - - /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. - * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. - * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. - * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. - */ - list_move_tail(item, &l->list); - - /* - * Once the lru lock is dropped, the entry might get freed. The - * swpentry is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. - */ - swpentry = entry->swpentry; - - /* - * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. - */ - spin_unlock(lock); - - writeback_result = zswap_writeback_entry(entry, swpentry); - - if (writeback_result) { - zswap_reject_reclaim_fail++; - ret = LRU_RETRY; - - /* - * Encountering a page already in swap cache is a sign that we are shrinking - * into the warmer region. We should terminate shrinking (if we're in the dynamic - * shrinker context). - */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) - *encountered_page_in_swapcache = true; - } else { - zswap_written_back_pages++; - } - - spin_lock(lock); - return ret; -} - static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; From b4489b2e8ef5277bd8925547b3464886c1192ff8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:40 -0800 Subject: [PATCH 541/707] Docs/admin-guide/mm/damon/usage: use sysfs interface for tracepoints example Patch series "mm/damon: make DAMON debugfs interface deprecation unignorable". DAMON debugfs interface is deprecated in February 2023, by commit 5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice"). Make the fact unable to be easily ignored by removing an example usage from the document (patch 1), renaming the config (patch 2), adding a deprecation notice file to the debugfs directory (patches 3-5), and renaming the debugfs file that essnetial to be used for real use of DAMON (patches 6-9). This patch (of 9): DAMON tracepoints example on the DAMON usage document is using DAMON debugfs interface, which is deprecated. Use its alternative, DAMON sysfs interface. Link: https://lkml.kernel.org/r/20240130013549.89538-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240130013549.89538-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9d23144bf98501..f2feabb4bd35c7 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -579,11 +579,11 @@ monitoring results recording. While the monitoring is turned on, you could record the tracepoint events and show results using tracepoint supporting tools like ``perf``. For example:: - # echo on > monitor_on + # echo on > kdamonds/0/state # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > kdamonds/0/state # perf script kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864 [...] From f85fa5fa90db6f65d92f6cb99c6bcb655898154b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:41 -0800 Subject: [PATCH 542/707] mm/damon: rename CONFIG_DAMON_DBGFS to DAMON_DBGFS_DEPRECATED DAMON debugfs interface is deprecated. The fact has documented by commit 5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice"). Commit 620932cd2852 ("mm/damon/dbgfs: print DAMON debugfs interface deprecation message") further started printing a warning message when users still use it. Many people don't read documentation or kernel log, though. Make the deprecation harder to be ignored using the approach of commit eb07c4f39c3e ("mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED"). 'make oldconfig' with 'CONFIG_DAMON_DBGFS=y' will get a new prompt with the explicit deprecation notice on the name. 'make olddefconfig' with 'CONFIG_DAMON_DBGFS=y' will result in not building DAMON debugfs interface. If there is a real user of DAMON debugfs interface, they will complain the change to the builder. Link: https://lkml.kernel.org/r/20240130013549.89538-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 29f43fbc2eff13..fecb8172410c54 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS +config DAMON_DBGFS_DEPRECATED bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help @@ -84,6 +84,11 @@ config DAMON_DBGFS (DAMON_SYSFS). If you depend on this and cannot move, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. +config DAMON_DBGFS + bool + default y + depends on DAMON_DBGFS_DEPRECATED + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y From adf9047adfff48aa1a17257f82f2f3f08eb0ccc9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:42 -0800 Subject: [PATCH 543/707] mm/damon/dbgfs: implement deprecation notice file Implement a read-only file for DAMON debugfs interface deprecation notice, to let users who manually read/write the DAMON debugfs files from their shell command line easily notice the fact. Link: https://lkml.kernel.org/r/20240130013549.89538-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 7dac24e69e3b95..fc6ece5a9f37cc 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -805,6 +805,18 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) damon_destroy_ctx(ctx); } +static ssize_t damon_dbgfs_deprecated_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char kbuf[512] = "DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"; + int len = strnlen(kbuf, 1024); + + return simple_read_from_buffer(buf, count, ppos, kbuf, len); +} + /* * Make a context of @name and create a debugfs directory for it. * @@ -1056,6 +1068,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) return nonseekable_open(inode, file); } +static const struct file_operations deprecated_fops = { + .read = damon_dbgfs_deprecated_read, +}; + static const struct file_operations mk_contexts_fops = { .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, @@ -1076,9 +1092,9 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on"}; + "monitor_on", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops}; + &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; dbgfs_root = debugfs_create_dir("damon", NULL); From 56c233831d70a2be9ddd5bf264e8db6def138f79 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:43 -0800 Subject: [PATCH 544/707] mm/damon/dbgfs: make debugfs interface deprecation message a macro DAMON debugfs interface deprecation message is written twice, once for the warning, and again for DEPRECATED file's read output. De-duplicate those by defining the message as a macro and reuse. Link: https://lkml.kernel.org/r/20240130013549.89538-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index fc6ece5a9f37cc..fbc0cd63f34c57 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -15,6 +15,11 @@ #include #include +#define DAMON_DBGFS_DEPRECATION_NOTICE \ + "DAMON debugfs interface is deprecated, so users should move " \ + "to DAMON_SYSFS. If you cannot, please report your usecase to " \ + "damon@lists.linux.dev and linux-mm@kvack.org.\n" + static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; @@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock); static void damon_dbgfs_warn_deprecation(void) { - pr_warn_once("DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"); + pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); } /* @@ -808,10 +810,7 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) static ssize_t damon_dbgfs_deprecated_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - char kbuf[512] = "DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"; + char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; int len = strnlen(kbuf, 1024); return simple_read_from_buffer(buf, count, ppos, kbuf, len); From 1364bbee446ea919e5d3f2433ba7457a986dd061 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:44 -0800 Subject: [PATCH 545/707] Docs/admin-guide/mm/damon/usage: document 'DEPRECATED' file of DAMON debugfs interface Document the newly added DAMON debugfs interface deprecation notice file on the usage document. Link: https://lkml.kernel.org/r/20240130013549.89538-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index f2feabb4bd35c7..5d3df18dfb9fc0 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -628,9 +628,16 @@ debugfs Interface (DEPRECATED!) move, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. -DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and -``rm_contexts`` under its debugfs directory, ``/damon/``. +DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, +``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` +and ``rm_contexts`` under its debugfs directory, ``/damon/``. + + +``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation +notice. Reading it returns the deprecation notice, as below:: + + # cat DEPRECATED + DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. Attributes From c79e7095868ed62e5843176d95517c7262d45c2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:45 -0800 Subject: [PATCH 546/707] selftets/damon: prepare for monitor_on file renaming Following change will rename 'monitor_on' DAMON debugfs file to 'monitor_on_DEPRECATED', to make the deprecation unignorable in runtime. Since it could make DAMON selftests fail and disturb future bisects, update DAMON selftests to support the change. Link: https://lkml.kernel.org/r/20240130013549.89538-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_chk_dependency.sh | 11 +++++++++-- tools/testing/selftests/damon/_debugfs_common.sh | 7 +++++++ .../testing/selftests/damon/debugfs_empty_targets.sh | 12 ++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 0328ac0b5a5ed0..350f8c2b071dbc 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -18,7 +18,14 @@ then exit $ksft_skip fi -for f in attrs target_ids monitor_on +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="monitor_on_DEPRECATED" +else + monitor_on_file="monitor_on" +fi + +for f in attrs target_ids "$monitor_on_file" do if [ ! -f "$DBGFS/$f" ] then @@ -28,7 +35,7 @@ do done permission_error="Operation not permitted" -for f in attrs target_ids monitor_on +for f in attrs target_ids "$monitor_on_file" do status=$( cat "$DBGFS/$f" 2>&1 ) if [ "${status#*$permission_error}" != "$status" ]; then diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh index 48989d4813ae8b..aa995516870bc8 100644 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -45,6 +45,13 @@ test_content() { source ./_chk_dependency.sh damon_onoff="$DBGFS/monitor_on" +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + damon_onoff="$DBGFS/monitor_on_DEPRECATED" +else + damon_onoff="$DBGFS/monitor_on" +fi + if [ $(cat "$damon_onoff") = "on" ] then echo "monitoring is on" diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh index 87aff8083822f6..effbea33dc1640 100755 --- a/tools/testing/selftests/damon/debugfs_empty_targets.sh +++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh @@ -8,6 +8,14 @@ source _debugfs_common.sh orig_target_ids=$(cat "$DBGFS/target_ids") echo "" > "$DBGFS/target_ids" -orig_monitor_on=$(cat "$DBGFS/monitor_on") -test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" + +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="$DBGFS/monitor_on_DEPRECATED" +else + monitor_on_file="$DBGFS/monitor_on" +fi + +orig_monitor_on=$(cat "$monitor_on_file") +test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids" echo "$orig_target_ids" > "$DBGFS/target_ids" From d0c90dcc06509d5ee9dda96a7393d60343c63885 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:46 -0800 Subject: [PATCH 547/707] mm/damon/dbgfs: rename monitor_on file to monitor_on_DEPRECATED Kernel builders could silently enable CONFIG_DAMON_DBGFS_DEPRECATED. Users who manually check the files under the DAMON debugfs directory could notice the deprecation owing to the 'DEPRECATED' DAMON debugfs file, but there could be users who doesn't manually check the files. Make the deprecation cannot be ignored in the case by renaming 'monitor_on' file, which is essential for real use of DAMON on runtime, to 'monitor_on_DEPRECATED'. Still users who control DAMON via only user-space tool could ignore the deprecation, but that's what the tool developers should take care of. DAMON user-space tool, damo, has also made a change[1] for the purpose. [1] commit 935dae76f2aee ("_damon_args: Rename --damon_interface to --damon_interface_DEPRECATED") of https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20240130013549.89538-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index fbc0cd63f34c57..f7abbc0633aaf0 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1091,7 +1091,7 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on", "DEPRECATED"}; + "monitor_on_DEPRECATED", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; From 58796385178b77697b9b8dee488e5e859bf716be Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:47 -0800 Subject: [PATCH 548/707] Docs/admin-guide/mm/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the usage document to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 29 ++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 5d3df18dfb9fc0..58c34e66b31b2b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -629,8 +629,9 @@ debugfs Interface (DEPRECATED!) linux-mm@kvack.org. DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, -``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` -and ``rm_contexts`` under its debugfs directory, ``/damon/``. +``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, +``mk_contexts`` and ``rm_contexts`` under its debugfs directory, +``/damon/``. ``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation @@ -855,16 +856,16 @@ Turning On/Off Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the -monitoring by writing to and reading from the ``monitor_on`` file. Writing -``on`` to the file starts the monitoring of the targets with the attributes. -Writing ``off`` to the file stops those. DAMON also stops if every target -process is terminated. Below example commands turn on, off, and check the -status of DAMON:: +monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file. +Writing ``on`` to the file starts the monitoring of the targets with the +attributes. Writing ``off`` to the file stops those. DAMON also stops if +every target process is terminated. Below example commands turn on, off, and +check the status of DAMON:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off Please note that you cannot write to the above-mentioned debugfs files while @@ -880,11 +881,11 @@ can get the pid of the thread by reading the ``kdamond_pid`` file. When the monitoring is turned off, reading the file returns ``none``. :: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -914,5 +915,5 @@ directory by putting the name of the context to the ``rm_contexts`` file. :: # ls foo # ls: cannot access 'foo': No such file or directory -Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the -root directory only. +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files +are in the root directory only. From dc35e9925ea1796a80a85a85742393d96f048810 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:48 -0800 Subject: [PATCH 549/707] Docs/translations/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the translated usage documents to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-10-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- .../zh_CN/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- .../zh_TW/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index 17b9949d9b4357..da2745464ece45 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. @@ -521,15 +521,15 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, 开关 ---- -除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on`` +除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED`` 文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入 ``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关 闭和检查DAMON的状态:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返 @@ -543,11 +543,11 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目录下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。 监测结果的监测点 @@ -583,9 +583,9 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 DAMON通过一个tracepoint ``damon:damon_aggregated`` 提供监测结果. 当监测开启时,你可 以记录追踪点事件,并使用追踪点支持工具如perf显示结果。比如说:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index 6dee719a32ea61..7464279f9b7de0 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. @@ -521,15 +521,15 @@ DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, 開關 ---- -除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on`` +除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED`` 文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入 ``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關 閉和檢查DAMON的狀態:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返 @@ -543,11 +543,11 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目錄下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。 監測結果的監測點 @@ -583,10 +583,10 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 DAMON通過一個tracepoint ``damon:damon_aggregated`` 提供監測結果. 當監測開啓時,你可 以記錄追蹤點事件,並使用追蹤點支持工具如perf顯示結果。比如說:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script From 22c124fdb062f9a4383337e30f4f93bb08792419 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 31 Jan 2024 11:19:13 +0800 Subject: [PATCH 550/707] mm/mmap: use SZ_{8K, 128K} helper macro Use SZ_{8K, 128K} helper macro instead of the number in init_user_reserve and reserve_mem_notifier. This is more readable. Link: https://lkml.kernel.org/r/20240131031913.2058597-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 476de5daf598d1..1f9e7024285866 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3845,7 +3845,7 @@ static int init_user_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); @@ -3866,7 +3866,7 @@ static int init_admin_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); @@ -3898,12 +3898,12 @@ static int reserve_mem_notifier(struct notifier_block *nb, case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 17)) + if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 13)) + if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; From 4f60623e13c270bab759ff8483dec2062f2a0964 Mon Sep 17 00:00:00 2001 From: Rakie Kim Date: Tue, 30 Jan 2024 13:20:44 -0500 Subject: [PATCH 551/707] mm/mempolicy: implement the sysfs-based weighted_interleave interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs extension", v4. Weighted interleave is a new interleave policy intended to make use of heterogeneous memory environments appearing with CXL. The existing interleave mechanism does an even round-robin distribution of memory across all nodes in a nodemask, while weighted interleave distributes memory across nodes according to a provided weight. (Weight = # of page allocations per round) Weighted interleave is intended to reduce average latency when bandwidth is pressured - therefore increasing total throughput. In other words: It allows greater use of the total available bandwidth in a heterogeneous hardware environment (different hardware provides different bandwidth capacity). As bandwidth is pressured, latency increases - first linearly and then exponentially. By keeping bandwidth usage distributed according to available bandwidth, we therefore can reduce the average latency of a cacheline fetch. A good explanation of the bandwidth vs latency response curve: https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/ From the article: ``` Constant region: The latency response is fairly constant for the first 40% of the sustained bandwidth. Linear region: In between 40% to 80% of the sustained bandwidth, the latency response increases almost linearly with the bandwidth demand of the system due to contention overhead by numerous memory requests. Exponential region: Between 80% to 100% of the sustained bandwidth, the memory latency is dominated by the contention latency which can be as much as twice the idle latency or more. Maximum sustained bandwidth : Is 65% to 75% of the theoretical maximum bandwidth. ``` As a general rule of thumb: * If bandwidth usage is low, latency does not increase. It is optimal to place data in the nearest (lowest latency) device. * If bandwidth usage is high, latency increases. It is optimal to place data such that bandwidth use is optimized per-device. This is the top line goal: Provide a user a mechanism to target using the "maximum sustained bandwidth" of each hardware component in a heterogenous memory system. For example, the stream benchmark demonstrates that 1:1 (default) interleave is actively harmful, while weighted interleave can be beneficial. Default interleave distributes data such that too much pressure is placed on devices with lower available bandwidth. Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device) Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) Targeted weights : +2.5% to +4% (consistently better than DRAM) Global means the task-policy was set (set_mempolicy), while targeted means VMA policies were set (mbind2). We see weighted interleave is not always beneficial when applied globally, but is always beneficial when applied to bandwidth-driving memory regions. There are 3 patches in this set: 1) Implement system-global interleave weights as sysfs extension in mm/mempolicy.c. These weights are RCU protected, and a default weight set is provided (all weights are 1 by default). In future work, we intend to expose an interface for HMAT/CDAT code to set reasonable default values based on the memory configuration of the system discovered at boot/hotplug. 2) A mild refactor of some interleave-logic for re-use in the new weighted interleave logic. 3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind Included below are some performance and LTP test information, and a sample numactl branch which can be used for testing. = Performance summary = (tests may have different configurations, see extended info below) 1) MLC (W2) : +38% over DRAM. +264% over default interleave. MLC (W5) : +40% over DRAM. +226% over default interleave. 2) Stream : -6% to +4% over DRAM, +430% over default interleave. 3) XSBench : +19% over DRAM. +47% over default interleave. = LTP Testing Summary = existing mempolicy & mbind tests: pass mempolicy & mbind + weighted interleave (global weights): pass = version history v4: - style fixes, code deduplication, simplifications, comments - moved mempolicy->il_weight to task_struct->il_weight - changed logic to simply move forward on il_weight=0 rather than treat il_weight=0 as a special value - detect when il_prev is no longer in the nodemask and move forward - missed weighted interleave check in alloc_pages_mpol() - uninitialized value of nr_allocated in bulk allocator ===================================================================== Performance tests - MLC From - Ravi Jonnalagadda Hardware: Single-socket, multiple CXL memory expanders. Workload: W2 Data Signature: 2:1 read:write DRAM only bandwidth (GBps): 298.8 DRAM + CXL (default interleave) (GBps): 113.04 DRAM + CXL (weighted interleave)(GBps): 412.5 Gain over DRAM only: 1.38x Gain over default interleave: 2.64x Workload: W5 Data Signature: 1:1 read:write DRAM only bandwidth (GBps): 273.2 DRAM + CXL (default interleave) (GBps): 117.23 DRAM + CXL (weighted interleave)(GBps): 382.7 Gain over DRAM only: 1.4x Gain over default interleave: 2.26x ===================================================================== Performance test - Stream From - Gregory Price Hardware: Single socket, single CXL expander numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) mbind2 weights : +2.5% to +4% (consistently better than DRAM) dram only: numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Function Direction BestRateMBs AvgTime MinTime MaxTime Copy: 0->0 200923.2 0.032662 0.031853 0.033301 Scale: 0->0 202123.0 0.032526 0.031664 0.032970 Add: 0->0 208873.2 0.047322 0.045961 0.047884 Triad: 0->0 208523.8 0.047262 0.046038 0.048414 CXL-only: numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 22209.7 0.288661 0.288162 0.289342 Scale: 0->0 22288.2 0.287549 0.287147 0.288291 Add: 0->0 24419.1 0.393372 0.393135 0.393735 Triad: 0->0 24484.6 0.392337 0.392083 0.394331 Based on the above, the optimal weights are ~9:1 echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1 echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2 default interleave: numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 44666.2 0.143671 0.143285 0.144174 Scale: 0->0 44781.6 0.143256 0.142916 0.143713 Add: 0->0 48600.7 0.197719 0.197528 0.197858 Triad: 0->0 48727.5 0.197204 0.197014 0.197439 global weighted interleave: numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 190085.9 0.034289 0.033669 0.034645 Scale: 0->0 207677.4 0.031909 0.030817 0.033061 Add: 0->0 202036.8 0.048737 0.047516 0.053409 Triad: 0->0 217671.5 0.045819 0.044103 0.046755 targted regions w/ global weights (modified stream to mbind2 malloc'd regions)) numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc Copy: 0->0 205827.0 0.031445 0.031094 0.031984 Scale: 0->0 208171.8 0.031320 0.030744 0.032505 Add: 0->0 217352.0 0.045087 0.044168 0.046515 Triad: 0->0 216884.8 0.045062 0.044263 0.046982 ===================================================================== Performance tests - XSBench From - Hyeongtak Ji Hardware: Single socket, Single CXL memory Expander NUMA node 0: 56 logical cores, 128 GB memory NUMA node 2: 96 GB CXL memory Threads: 56 Lookups: 170,000,000 Summary: +19% over DRAM. +47% over default interleave. Performance tests - XSBench 1. dram only $ numactl -m 0 ./XSBench -s XL –p 5000000 Runtime: 36.235 seconds Lookups/s: 4,691,618 2. default interleave $ numactl –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 55.243 seconds Lookups/s: 3,077,293 3. weighted interleave numactl –w –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 29.262 seconds Lookups/s: 5,809,513 ===================================================================== LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2 = Existing tests set_mempolicy, get_mempolicy, mbind MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality but did not adjust tests for weighting. Basically the weights were set to 1, which is the default, and it should behave the same as MPOL_INTERLEAVE if logic is correct. == set_mempolicy01 : passed 18, failed 0 == set_mempolicy02 : passed 10, failed 0 == set_mempolicy03 : passed 64, failed 0 == set_mempolicy04 : passed 32, failed 0 == set_mempolicy05 - n/a on non-x86 == set_mempolicy06 : passed 10, failed 0 this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE == set_mempolicy07 : passed 32, failed 0 set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE == get_mempolicy01 : passed 12, failed 0 change: added MPOL_WEIGHTED_INTERLEAVE == get_mempolicy02 : passed 2, failed 0 == mbind01 : passed 15, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind02 : passed 4, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind03 : passed 16, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind04 : passed 48, failed 0 added MPOL_WEIGHTED_INTERLEAVE ===================================================================== numactl (set_mempolicy) w/ global weighting test numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master command: numactl -w --interleave=0,1 ./eatmem result (weights 1:1): 0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4 7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4 50% distribution is correct result (weights 5:1): 01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4 7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4 16.666% distribution is correct result (weights 1:5): 01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4 7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4 16.666% distribution is correct #include #include #include int main (void) { char* mem = malloc(1024*1024*256); memset(mem, 1, 1024*1024*256); for (int i = 0; i < ((1024*1024*256)/4096); i++) { mem = malloc(4096); mem[0] = 1; } printf("done\n"); getchar(); return 0; } ===================================================================== This patch (of 3): This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. Link: https://lkml.kernel.org/r/20240130182046.74278-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240130182046.74278-2-gregory.price@memverge.com Suggested-by: Huang Ying Signed-off-by: Rakie Kim Signed-off-by: Honggyu Kim Co-developed-by: Gregory Price Signed-off-by: Gregory Price Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Dan Williams Cc: Frank van der Linden Cc: Gregory Price Cc: Hasan Al Maruf Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Hocko Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + ...fs-kernel-mm-mempolicy-weighted-interleave | 25 ++ mm/mempolicy.c | 223 ++++++++++++++++++ 3 files changed, 252 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy new file mode 100644 index 00000000000000..8ac327fd7fb6e3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy @@ -0,0 +1,4 @@ +What: /sys/kernel/mm/mempolicy/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Interface for Mempolicy diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave new file mode 100644 index 00000000000000..0b7972de04e939 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/mempolicy/weighted_interleave/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Configuration Interface for the Weighted Interleave policy + +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN +Date: January 2024 +Contact: Linux memory management mailing list +Description: Weight configuration interface for nodeN + + The interleave weight for a memory node (N). These weights are + utilized by tasks which have set their mempolicy to + MPOL_WEIGHTED_INTERLEAVE. + + These weights only affect new allocations, and changes at runtime + will not cause migrations on already allocated pages. + + The minimum weight for a node is always 1. + + Minimum weight: 1 + Maximum weight: 255 + + Writing an empty string or `0` will reset the weight to the + system default. The system default may be set by the kernel + or drivers at boot or during hotplug events. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e519163c4dcb6..e62b205b4ee55e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,6 +131,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. A NULL iw_table also denotes that + * system-default values should be used. Until the system-default table + * is implemented, the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + +static u8 get_il_weight(int node) +{ + u8 __rcu *table; + u8 weight; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* if no iw_table, use system default */ + weight = table ? table[node] : 1; + /* if value in iw_table is 0, use system default */ + weight = weight ? weight : 1; + rcu_read_unlock(); + return weight; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -3063,3 +3089,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 __rcu *new; + u8 __rcu *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + new = kzalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr **node_attrs; + +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, + struct kobject *parent) +{ + if (!node_attr) + return; + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); +} + +static void sysfs_wi_release(struct kobject *wi_kobj) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) + sysfs_wi_node_release(node_attrs[i], wi_kobj); + kobject_put(wi_kobj); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = sysfs_wi_release, +}; + +static int add_weight_node(int nid, struct kobject *wi_kobj) +{ + struct iw_node_attr *node_attr; + char *name; + + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); + if (!node_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(node_attr); + return -ENOMEM; + } + + sysfs_attr_init(&node_attr->kobj_attr.attr); + node_attr->kobj_attr.attr.name = name; + node_attr->kobj_attr.attr.mode = 0644; + node_attr->kobj_attr.show = node_show; + node_attr->kobj_attr.store = node_store; + node_attr->nid = nid; + + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); + pr_err("failed to add attribute to weighted_interleave\n"); + return -ENOMEM; + } + + node_attrs[nid] = node_attr; + return 0; +} + +static int add_weighted_interleave_group(struct kobject *root_kobj) +{ + struct kobject *wi_kobj; + int nid, err; + + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!wi_kobj) + return -ENOMEM; + + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + "weighted_interleave"); + if (err) { + kfree(wi_kobj); + return err; + } + + for_each_node_state(nid, N_POSSIBLE) { + err = add_weight_node(nid, wi_kobj); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + break; + } + } + if (err) + kobject_put(wi_kobj); + return 0; +} + +static void mempolicy_kobj_release(struct kobject *kobj) +{ + u8 __rcu *old; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + rcu_assign_pointer(iw_table, NULL); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + kfree(node_attrs); + kfree(kobj); +} + +static const struct kobj_type mempolicy_ktype = { + .release = mempolicy_kobj_release +}; + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); + if (!mempolicy_kobj) { + err = -ENOMEM; + goto err_out; + } + + node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), + GFP_KERNEL); + if (!node_attrs) { + err = -ENOMEM; + goto mempol_out; + } + + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, + "mempolicy"); + if (err) + goto node_out; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) { + pr_err("mempolicy sysfs structure failed to initialize\n"); + kobject_put(mempolicy_kobj); + return err; + } + + return err; +node_out: + kfree(node_attrs); +mempol_out: + kfree(mempolicy_kobj); +err_out: + pr_err("failed to add mempolicy kobject to the system\n"); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */ From 01e5bdb29292ea9200d2d6ae062fdf7c374ff0e0 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 30 Jan 2024 13:20:45 -0500 Subject: [PATCH 552/707] mm/mempolicy: refactor a read-once mechanism into a function for re-use Move the use of barrier() to force policy->nodemask onto the stack into a function `read_once_policy_nodemask` so that it may be re-used. Link: https://lkml.kernel.org/r/20240130182046.74278-3-gregory.price@memverge.com Signed-off-by: Gregory Price Suggested-by: Huang Ying Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Dan Williams Cc: Frank van der Linden Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e62b205b4ee55e..95682af4c478a5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1905,6 +1905,20 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1912,20 +1926,12 @@ unsigned int mempolicy_slab_node(void) */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. - */ - barrier(); - nnodes = nodes_weight(nodemask); + nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; From 147371fd14993ae64ef2f612eb15f37fcef12d83 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 30 Jan 2024 13:20:46 -0500 Subject: [PATCH 553/707] mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Default interleave uses this to track the last used node. Weighted interleave uses this to track the *current* node, and when weight reaches 0 it will be used to acquire the next node. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actual application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240130182046.74278-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf Signed-off-by: Gregory Price Co-developed-by: Rakie Kim Signed-off-by: Rakie Kim Co-developed-by: Honggyu Kim Signed-off-by: Honggyu Kim Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Co-developed-by: Srinivasulu Thanneeru Signed-off-by: Srinivasulu Thanneeru Co-developed-by: Ravi Jonnalagadda Signed-off-by: Ravi Jonnalagadda Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Dan Williams Cc: Frank van der Linden Cc: Huang Ying Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- .../admin-guide/mm/numa_memory_policy.rst | 9 + include/linux/sched.h | 1 + include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 231 +++++++++++++++++- 4 files changed, 238 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index eca38fa81e0f98..a70f20ce1ffb4f 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY can fall back to all existing numa nodes. This is effectively MPOL_PREFERRED allowed for a mask rather than a single node. +MPOL_WEIGHTED_INTERLEAVE + This mode operates the same as MPOL_INTERLEAVE, except that + interleaving behavior is executed based on weights set in + /sys/kernel/mm/mempolicy/weighted_interleave/ + + Weighted interleave allocates pages on nodes according to a + weight. For example if nodes [0,1] are weighted [5,2], 5 pages + will be allocated on node0 for every 2 pages allocated on node1. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365ba..e6095570a464e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1259,6 +1259,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index a8963f7ef4c279..1f9bb10d1a473f 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -23,6 +23,7 @@ enum { MPOL_INTERLEAVE, MPOL_LOCAL, MPOL_PREFERRED_MANY, + MPOL_WEIGHTED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 95682af4c478a5..eca3e41e7787b9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -19,6 +19,13 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. + * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node @@ -441,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, @@ -858,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -884,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: @@ -968,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -1332,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len, * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ - if (new->mode == MPOL_INTERLEAVE) { + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; @@ -1780,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving - * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1797,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } @@ -1847,6 +1872,22 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) +{ + unsigned int node = current->il_prev; + + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + /* can only happen if nodemask is being rebound */ + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); + } + current->il_weight--; + return node; +} + /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1881,6 +1922,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -1919,6 +1963,45 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, return nodes_weight(*mask); } +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 __rcu *table; + unsigned int weight_total = 0; + u8 weight; + int nid; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + weight_total += weight; + } + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1979,6 +2062,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; } return nodemask; @@ -2040,6 +2128,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2140,6 +2229,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but @@ -2275,6 +2365,127 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct task_struct *me = current; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 __rcu *table, *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node, next_node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + nnodes = read_once_policy_nodemask(pol, &nodes); + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages < weight) { + /* stay on current node, adjust il_weight */ + me->il_weight -= rem_pages; + return total_allocated; + } else if (rem_pages == weight) { + /* move to next node / weight */ + me->il_prev = next_node_in(node, nodes); + me->il_weight = get_il_weight(next_node); + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + if (table) + memcpy(weights, table, nr_node_ids); + rcu_read_unlock(); + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) { + if (!weights[node]) + weights[node] = 1; + weight_total += weights[node]; + } + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. + * Track which node weighted interleave should resume from. + * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + node_pages += delta; + /* delta may deplete on a boundary or w/ a remainder */ + if (delta == weight) { + /* boundary: resume from next node/weight */ + resume_node = next_node_in(node, nodes); + resume_weight = weights[resume_node]; + } else { + /* remainder: resume this node w/ remainder */ + resume_node = node; + resume_weight = weight - delta; + } + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2315,6 +2526,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2390,6 +2605,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2526,6 +2742,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, polnid = interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); + break; + case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; @@ -2900,6 +3120,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2959,6 +3180,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3069,6 +3291,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: From 041aafc133cc4c8826c58d8c0da7635cce9444cd Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Wed, 31 Jan 2024 00:12:24 -0500 Subject: [PATCH 554/707] mm-mempolicy-introduce-mpol_weighted_interleave-for-weighted-interleaving-fix. kill next_node in favor of operating directly on il_prev Link: https://lkml.kernel.org/r/ZbnWuB4dRCEFRz2m@memverge.com Signed-off-by: Gregory Price Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Dan Williams Cc: Frank van der Linden Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Huang Ying Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eca3e41e7787b9..bbf7cd328ea6eb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2378,7 +2378,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; - int nnodes, node, next_node; + int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; @@ -2408,7 +2408,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, } else if (rem_pages == weight) { /* move to next node / weight */ me->il_prev = next_node_in(node, nodes); - me->il_weight = get_il_weight(next_node); + me->il_weight = get_il_weight(me->il_prev); return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ From f5e62bcf472e53a5e37cf9ebe46d5067442541f1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:32 +0100 Subject: [PATCH 555/707] arm: ptdump: rename CONFIG_DEBUG_WX to CONFIG_ARM_DEBUG_WX Patch series "mm: ptdump: Refactor CONFIG_DEBUG_WX and check_wx_pages debugfs attribute", v2. This series refactors CONFIG_DEBUG_WX for the 5 architectures implementing CONFIG_GENERIC_PTDUMP First rename stuff in ARM which uses similar names while not implementing CONFIG_GENERIC_PTDUMP. Then define a generic version of debug_checkwx() that calls ptdump_check_wx() when CONFIG_DEBUG_WX is set. Call it immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). Then implement a debugfs attribute that can be used to trigger a W^X test at anytime and regardless of CONFIG_DEBUG_WX CONFIG_DEBUG_WX is a core option defined in mm/Kconfig.debug To avoid any future conflict, rename ARM version into CONFIG_ARM_DEBUG_WX. Link: https://lore.kernel.org/lkml/20200422152656.GF676@willie-the-truck/T/#m802eaf33efd6f8d575939d157301b35ac0d4a64f Link: https://github.com/KSPP/linux/issues/35 Link: https://lkml.kernel.org/r/cover.1706610398.git.christophe.leroy@csgroup.eu Link: https://lkml.kernel.org/r/fa297aa90caeb61eee2b70c6c5897a2ab58a9562.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm/Kconfig.debug | 2 +- arch/arm/configs/aspeed_g4_defconfig | 2 +- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/include/asm/ptdump.h | 6 +++--- arch/arm/mm/init.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5fbbac1b708b0a..f1fc278081d035 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -17,7 +17,7 @@ config ARM_PTDUMP_DEBUGFS kernel. If in doubt, say "N" -config DEBUG_WX +config ARM_DEBUG_WX bool "Warn on W+X mappings at boot" depends on MMU select ARM_PTDUMP_CORE diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index b3dc0465796f9a..28b724d59e7e23 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -252,7 +252,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 3fdf4dbfdea5db..61cee1e7ebea61 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -302,7 +302,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h index aad1d034136cea..46a4575146ee85 100644 --- a/arch/arm/include/asm/ptdump.h +++ b/arch/arm/include/asm/ptdump.h @@ -32,10 +32,10 @@ void ptdump_check_wx(void); #endif /* CONFIG_ARM_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() +#ifdef CONFIG_ARM_DEBUG_WX +#define arm_debug_checkwx() ptdump_check_wx() #else -#define debug_checkwx() do { } while (0) +#define arm_debug_checkwx() do { } while (0) #endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a42e4cd11db294..4c3d78691279d3 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -458,7 +458,7 @@ static int __mark_rodata_ro(void *unused) void mark_rodata_ro(void) { stop_machine(__mark_rodata_ro, NULL, NULL); - debug_checkwx(); + arm_debug_checkwx(); } #else From b7062272ee15bad310018ee4538a9b229e70062f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:33 +0100 Subject: [PATCH 556/707] arm64, powerpc, riscv, s390, x86: ptdump: refactor CONFIG_DEBUG_WX All architectures using the core ptdump functionality also implement CONFIG_DEBUG_WX, and they all do it more or less the same way, with a function called debug_checkwx() that is called by mark_rodata_ro(), which is a substitute to ptdump_check_wx() when CONFIG_DEBUG_WX is set and a no-op otherwise. Refactor by centrally defining debug_checkwx() in linux/ptdump.h and call debug_checkwx() immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). On x86_32, mark_rodata_ro() first checks __supported_pte_mask has _PAGE_NX before calling debug_checkwx(). Now the check is inside the callee ptdump_walk_pgd_level_checkwx(). On powerpc_64, mark_rodata_ro() bails out early before calling ptdump_check_wx() when the MMU doesn't have KERNEL_RO feature. The check is now also done in ptdump_check_wx() as it is called outside mark_rodata_ro(). Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/ptdump.h | 7 ------- arch/arm64/mm/mmu.c | 2 -- arch/powerpc/mm/mmu_decl.h | 6 ------ arch/powerpc/mm/pgtable_32.c | 4 ---- arch/powerpc/mm/pgtable_64.c | 3 --- arch/powerpc/mm/ptdump/ptdump.c | 3 +++ arch/riscv/include/asm/ptdump.h | 22 ---------------------- arch/riscv/mm/init.c | 3 --- arch/riscv/mm/ptdump.c | 1 - arch/s390/include/asm/ptdump.h | 14 -------------- arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/init.c | 2 -- arch/x86/include/asm/pgtable.h | 3 +-- arch/x86/mm/dump_pagetables.c | 3 +++ arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- include/linux/ptdump.h | 7 +++++++ init/main.c | 2 ++ 18 files changed, 16 insertions(+), 71 deletions(-) delete mode 100644 arch/riscv/include/asm/ptdump.h delete mode 100644 arch/s390/include/asm/ptdump.h diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 581caac525b03a..5b1701c76d1cec 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } #endif -void ptdump_check_wx(void); #endif /* CONFIG_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() -#else -#define debug_checkwx() do { } while (0) -#endif - #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d794b2f4b5a3cd..34f3f777c4f192 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -632,8 +632,6 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - - debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 72341b9fb5521f..90dcc284405629 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -171,12 +171,6 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5c02fd08d61eff..12498017da8e43 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -153,7 +153,6 @@ void mark_rodata_ro(void) if (v_block_mapped((unsigned long)_stext + 1)) { mmu_mark_rodata_ro(); - ptdump_check_wx(); return; } @@ -166,9 +165,6 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)_stext); set_memory_ro((unsigned long)_stext, numpages); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5ac1fd30341bb2..1b366526f4f21e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -150,9 +150,6 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2313053fe679ed..620d4917ebe8a6 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -343,6 +343,9 @@ void ptdump_check_wx(void) } }; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h deleted file mode 100644 index 3c9ea6dd5af7eb..00000000000000 --- a/arch/riscv/include/asm/ptdump.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 SiFive - */ - -#ifndef _ASM_RISCV_PTDUMP_H -#define _ASM_RISCV_PTDUMP_H - -void ptdump_check_wx(void); - -#ifdef CONFIG_DEBUG_WX -static inline void debug_checkwx(void) -{ - ptdump_check_wx(); -} -#else -static inline void debug_checkwx(void) -{ -} -#endif - -#endif /* _ASM_RISCV_PTDUMP_H */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 245919dda91043..a69d8ce289ae1a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -723,8 +722,6 @@ void mark_rodata_ro(void) if (IS_ENABLED(CONFIG_64BIT)) set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), set_memory_ro); - - debug_checkwx(); } #else static __init pgprot_t pgprot_from_va(uintptr_t va) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 657c27bc07a769..07526560331366 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h deleted file mode 100644 index f960b2896606a1..00000000000000 --- a/arch/s390/include/asm/ptdump.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _ASM_S390_PTDUMP_H -#define _ASM_S390_PTDUMP_H - -void ptdump_check_wx(void); - -static inline void debug_checkwx(void) -{ - if (IS_ENABLED(CONFIG_DEBUG_WX)) - ptdump_check_wx(); -} - -#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index d37a8f607b7188..8dcb4e0c71bde6 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8d9a60ccb7771a..f6391442c0c2ad 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -109,7 +108,6 @@ void mark_rodata_ro(void) __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } int set_memory_encrypted(unsigned long vaddr, int numpages) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a103e..6c979028e5212f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -32,6 +32,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); void ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* @@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc26d..0008524eebe9af 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -433,6 +433,9 @@ void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { + if (!(__supported_pte_mask & _PAGE_NX)) + return; + ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b63403d7179df4..5c736b707caea0 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -800,6 +800,4 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); - if (__supported_pte_mask & _PAGE_NX) - debug_checkwx(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a0dffaca6d2bfc..ebdbcae48011d4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1412,8 +1412,6 @@ void mark_rodata_ro(void) (void *)text_end, (void *)rodata_start); free_kernel_image_pages("unused kernel image (rodata/data gap)", (void *)rodata_end, (void *)_sdata); - - debug_checkwx(); } /* diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 2a3a955864259a..c10513739bf951 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,5 +19,12 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} #endif /* _LINUX_PTDUMP_H */ diff --git a/init/main.c b/init/main.c index e24b0780fdff7a..749a9f8d2c9b0d 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1408,6 +1409,7 @@ static void mark_readonly(void) */ rcu_barrier(); mark_rodata_ro(); + debug_checkwx(); rodata_test(); } else pr_info("Kernel memory protection disabled.\n"); From 938533d46be747f58fc495c04ef8cb1fa2b2f67c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:34 +0100 Subject: [PATCH 557/707] powerpc,s390: ptdump: define ptdump_check_wx() regardless of CONFIG_DEBUG_WX Following patch will use ptdump_check_wx() regardless of CONFIG_DEBUG_WX, so define it at all times on powerpc and s390 just like other architectures. Though keep the WARN_ON_ONCE() only when CONFIG_DEBUG_WX is set. Link: https://lkml.kernel.org/r/07bfb04c7fec58e84413e91d2533581be357a696.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/ptdump/ptdump.c | 7 +++---- arch/s390/mm/dump_pagetables.c | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 620d4917ebe8a6..b835c80371cd28 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx) + if (!st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) return; - WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; @@ -326,7 +327,6 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -354,7 +354,6 @@ void ptdump_check_wx(void) else pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } -#endif static int __init ptdump_init(void) { diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 8dcb4e0c71bde6..99da5a5602a8ae 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -121,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -138,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) */ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -193,7 +192,6 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -226,7 +224,6 @@ void ptdump_check_wx(void) (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) From 7282bf6c3c75d9d06894a87e9e7783ada7e80467 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:35 +0100 Subject: [PATCH 558/707] mm: ptdump: have ptdump_check_wx() return bool Have ptdump_check_wx() return true when the check is successful or false otherwise. Link: https://lkml.kernel.org/r/7943149fe955458cb7b57cd483bf41a3aad94684.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/ptdump.c | 11 ++++++++--- arch/powerpc/mm/ptdump/ptdump.c | 13 +++++++++---- arch/riscv/mm/ptdump.c | 11 ++++++++--- arch/s390/mm/dump_pagetables.c | 13 +++++++++---- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/dump_pagetables.c | 19 ++++++++++++------- include/linux/ptdump.h | 2 +- 7 files changed, 48 insertions(+), 23 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e23..696822f755827e 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -345,7 +345,7 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = PAGE_OFFSET, }; -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -366,11 +366,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages || st.uxn_pages) + if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", st.wx_pages, st.uxn_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index b835c80371cd28..9dc239967b77f7 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -327,7 +327,7 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -344,15 +344,20 @@ void ptdump_check_wx(void) }; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 07526560331366..1289cc6d3700cd 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -335,7 +335,7 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -356,11 +356,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 99da5a5602a8ae..ffd07ed7b4af88 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -192,7 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { @@ -215,14 +215,19 @@ void ptdump_check_wx(void) }; if (!MACHINE_HAS_NX) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); + + return true; + } } #ifdef CONFIG_PTDUMP_DEBUGFS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6c979028e5212f..b50b2ef63672f4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -31,7 +31,7 @@ struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); #define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0008524eebe9af..c58c01f560fd87 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,7 +362,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -static void ptdump_walk_pgd_level_core(struct seq_file *m, +bool void ptdump_walk_pgd_level_core(struct seq_file *m, struct mm_struct *mm, pgd_t *pgd, bool checkwx, bool dmesg) { @@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, ptdump_walk_pgd(&st.ptdump, mm, pgd); if (!checkwx) - return; - if (st.wx_pages) + return true; + if (st.wx_pages) { pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", st.wx_pages); - else + + return false; + } else { pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); + + return true; + } } void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) @@ -431,12 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void) #endif } -void ptdump_walk_pgd_level_checkwx(void) +bool ptdump_walk_pgd_level_checkwx(void) { if (!(__supported_pte_mask & _PAGE_NX)) - return; + return true; - ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); + return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } static int __init pt_dump_init(void) diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index c10513739bf951..953b61696ccf7b 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,7 +19,7 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); -void ptdump_check_wx(void); +bool ptdump_check_wx(void); static inline void debug_checkwx(void) { From 24976a6ac9176a30e731cc0c2cd790d3336dd75c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2024 01:13:01 -0800 Subject: [PATCH 559/707] mm-ptdump-have-ptdump_check_wx-return-bool-fix fix a couple of build issues (x86_64 allmodconfig) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/dump_pagetables.c | 6 +++--- include/linux/ptdump.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index c58c01f560fd87..35b2cfd4791418 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -bool void ptdump_walk_pgd_level_core(struct seq_file *m, - struct mm_struct *mm, pgd_t *pgd, - bool checkwx, bool dmesg) +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg) { const struct ptdump_range ptdump_ranges[] = { #ifdef CONFIG_X86_64 diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 953b61696ccf7b..8dbd51ea862678 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -18,6 +18,9 @@ struct ptdump_state { const struct ptdump_range *range; }; +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg); void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); bool ptdump_check_wx(void); From b654b75b3060bc5ce65cda6fa110fc6a321de10c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:36 +0100 Subject: [PATCH 560/707] mm: ptdump: add check_wx_pages debugfs attribute Add a readable attribute in debugfs to trigger a W^X pages check at any time. To trigger the test, just read /sys/kernel/debug/check_wx_pages It will report FAILED if the test failed, SUCCESS otherwise. Detailed result is provided into dmesg. Link: https://lkml.kernel.org/r/e947fb1a9f3f5466344823e532d343ff194ae03d.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/ptdump.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/ptdump.c b/mm/ptdump.c index 03c1bdae4a4368..106e1d66e9f9ee 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) /* Flush out the last page */ st->note_page(st, 0, -1, 0); } + +static int check_wx_show(struct seq_file *m, void *v) +{ + if (ptdump_check_wx()) + seq_puts(m, "SUCCESS\n"); + else + seq_puts(m, "FAILED\n"); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(check_wx); + +static int ptdump_debugfs_init(void) +{ + debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops); + + return 0; +} + +device_initcall(ptdump_debugfs_init); From 2bcb578627a37b56e2fa2099be7bcd46f115bd5b Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 29 Jan 2024 10:03:04 +0800 Subject: [PATCH 561/707] modules: wait do_free_init correctly commit 1a7b7d922081 ("modules: Use vmalloc special flag") moves do_free_init() into a global workqueue instead of call_rcu(). So now rcu_barrier() can not ensure that do_free_init has completed. We should wait it via flush_work(). Without this fix, we still could encounter false positive reports in W+X checking, and rcu synchronization is unnecessary. Link: https://lkml.kernel.org/r/20240129020304.1981372-1-changbin.du@huawei.com Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Signed-off-by: Changbin Du Cc: Xiaoyi Su Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/moduleloader.h | 8 ++++++++ init/main.c | 5 +++-- kernel/module/main.c | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832ed..89b1e0ed981144 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +#ifdef CONFIG_MODULES +void flush_module_init_free_work(void); +#else +static inline void flush_module_init_free_work(void) +{ +} +#endif + /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); diff --git a/init/main.c b/init/main.c index 749a9f8d2c9b0d..504d417ab9f0f3 100644 --- a/init/main.c +++ b/init/main.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -1403,11 +1404,11 @@ static void mark_readonly(void) if (rodata_enabled) { /* * load_module() results in W+X mappings, which are cleaned - * up with call_rcu(). Let's make sure that queued work is + * up with init_free_wq. Let's make sure that queued work is * flushed so that we don't hit false positives looking for * insecure pages which are W+X. */ - rcu_barrier(); + flush_module_init_free_work(); mark_rodata_ro(); debug_checkwx(); rodata_test(); diff --git a/kernel/module/main.c b/kernel/module/main.c index 36681911c05acd..ea66b5c2a2a157 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w) } } +void flush_module_init_free_work(void) +{ + flush_work(&init_free_wq); +} + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." /* Default value for module->async_probe_requested */ From c387d996faff9bf2d105dfbfef337da82bff6c5c Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 11 May 2023 13:22:30 +0800 Subject: [PATCH 562/707] mm: optimization on page allocation when CMA enabled According to current CMA utilization policy, an alloc_pages(GFP_USER) could 'steal' UNMOVABLE & RECLAIMABLE page blocks via the help of CMA(pass zone_watermark_ok by counting CMA in but use U&R in rmqueue), which could lead to following alloc_pages(GFP_KERNEL) fail. Solving this by introducing second watermark checking for GFP_MOVABLE, which could have the allocation use CMA when proper. -- Free_pages(30MB) | | -- WMARK_LOW(25MB) | -- Free_CMA(12MB) | | -- Link: https://lkml.kernel.org/r/20231016071245.2865233-1-zhaoyang.huang@unisoc.com Link: https://lkml.kernel.org/r/1683782550-25799-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Cc: Joonsoo Kim Cc: ke.wang Cc: Minchan Kim Cc: Roman Gushchin Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 140c4f372db169..e6e2ac722a82fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2084,6 +2084,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, } +#ifdef CONFIG_CMA +/* + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok + * again without ALLOC_CMA to see if to use CMA first. + */ +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + unsigned long watermark; + bool cma_first = false; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */ + if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2); + } else { + /* + * watermark failed means UNMOVABLE & RECLAIMBLE is not enough + * now, we should use cma first to keep them stay around the + * corresponding watermark + */ + cma_first = true; + } + return cma_first; +} +#else +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + return false; +} +#endif /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. @@ -2097,12 +2134,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, if (IS_ENABLED(CONFIG_CMA)) { /* * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. + * allocating from CMA base on judging zone_watermark_ok again + * to see if the latest check got pass via the help of CMA */ if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { + use_cma_first(zone, order, alloc_flags)) { page = __rmqueue_cma_fallback(zone, order); if (page) return page; From 196694dc994059d47ed08c762981020ded1700e5 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:36 -0800 Subject: [PATCH 563/707] mm: add defines for min/max swappiness Patch series "Add swappiness argument to memory.reclaim", v6. This patch proposes augmenting the memory.reclaim interface with a swappiness= argument that overrides the swappiness value for that instance of proactive reclaim. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. Previously, this exact interface addition was proposed by Yosry[3]. In response, Roman proposed instead an interface to specify precise file/anon/slab reclaim amounts[4]. More recently Huan also proposed this as well[5] and others similarly questioned if this was the proper interface. Previous proposals sought to use this to allow proactive reclaimers to effectively perform a custom reclaim algorithm by issuing proactive reclaim with different settings to control file vs anon reclaim (e.g. to only reclaim anon from some applications). Responses argued that adjusting swappiness is a poor interface for custom reclaim. In contrast, I argue in favor of a swappiness setting not as a way to implement custom reclaim algorithms but rather to bias the balance of anon vs file due to differences of proactive vs reactive reclaim. In this context, swappiness is the existing interface for controlling this balance and this patch simply allows for it to be configured differently for proactive vs reactive reclaim. Specifying explicit amounts of anon vs file pages to reclaim feels inappropriate for this prupose. Proactive reclaimers are un-aware of the relative age of file vs anon for a cgroup which makes it difficult to manage proactive reclaim of different memory pools. A proactive reclaimer would need some amount of anon reclaim attempts separate from the amount of file reclaim attempts which seems brittle given that it's difficult to observe the impact. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 [3]https://lore.kernel.org/linux-mm/CAJD7tkbDpyoODveCsnaqBBMZEkDvshXJmNdbk51yKSNgD7aGdg@mail.gmail.com/ [4]https://lore.kernel.org/linux-mm/YoPHtHXzpK51F%2F1Z@carbon/ [5]https://lore.kernel.org/lkml/20231108065818.19932-1-link@vivo.com/ This patch (of 2): We use the constants 0 and 200 in a few places in the mm code when referring to the min and max swappiness. This patch adds MIN_SWAPPINESS and MAX_SWAPPINESS #defines to improve clarity. There are no functional changes. Link: https://lkml.kernel.org/r/20240103164841.2800183-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20240103164841.2800183-2-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: David Rientjes Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Yosry Ahmed Cc: Yue Zhao Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ mm/memcontrol.c | 2 +- mm/vmscan.c | 14 +++++++------- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad26169..d210a5dc43013c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MIN_SWAPPINESS 0 +#define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eb8684269906e9..dcfe5189e3be60 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4384,7 +4384,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 200) + if (val > MAX_SWAPPINESS) return -EINVAL; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f139830b26f6c..44b92fb3d09775 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ struct scan_control { #endif /* - * From 0 .. 200. Higher means more swappy. + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; @@ -2404,7 +2404,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; - fp = (200 - swappiness) * (total_cost + 1); + fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; @@ -4422,7 +4422,7 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx { int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; /* * Compare the first tier of anon with that of file to determine which @@ -4458,7 +4458,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw /* * Try to make the obvious choice first. When anon and file are both * available from the same generation, interpret swappiness 1 as file - * first and 200 as anon first. + * first and MAX_SWAPPINESS as anon first. */ if (!swappiness) type = LRU_GEN_FILE; @@ -4466,7 +4466,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; - else if (swappiness == 200) + else if (swappiness == MAX_SWAPPINESS) type = LRU_GEN_ANON; else type = get_type_to_scan(lruvec, swappiness, &tier); @@ -5408,9 +5408,9 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, lruvec = get_lruvec(memcg, nid); - if (swappiness < 0) + if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > 200) + else if (swappiness > MAX_SWAPPINESS) goto done; switch (cmd) { From 3a92c45e4ba694381c46994f3fde0d8544a2088b Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 3 Jan 2024 08:48:37 -0800 Subject: [PATCH 564/707] mm: add swappiness= arg to memory.reclaim Allow proactive reclaimers to submit an additional swappiness= argument to memory.reclaim. This overrides the global or per-memcg swappiness setting for that reclaim attempt. For example: echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 0 (no swap) regardless of the vm.swappiness sysctl setting. Userspace proactive reclaimers use the memory.reclaim interface to trigger reclaim. The memory.reclaim interface does not allow for any way to effect the balance of file vs anon during proactive reclaim. The only approach is to adjust the vm.swappiness setting. However, there are a few reasons we look to control the balance of file vs anon during proactive reclaim, separately from reactive reclaim: * Swapout should be limited to manage SSD write endurance. In near-OOM situations we are fine with lots of swap-out to avoid OOMs. As these are typically rare events, they have relatively little impact on write endurance. However, proactive reclaim runs continuously and so its impact on SSD write endurance is more significant. Therefore it is desireable to control swap-out for proactive reclaim separately from reactive reclaim * Some userspace OOM killers like systemd-oomd[1] support OOM killing on swap exhaustion. This makes sense if the swap exhaustion is triggered due to reactive reclaim but less so if it is triggered due to proactive reclaim (e.g. one could see OOMs when free memory is ample but anon is just particularly cold). Therefore, it's desireable to have proactive reclaim reduce or stop swap-out before the threshold at which OOM killing occurs. In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness before writes to memory.reclaim[2]. This has been in production for nearly two years and has addressed our needs to control proactive vs reactive reclaim behavior but is still not ideal for a number of reasons: * vm.swappiness is a global setting, adjusting it can race/interfere with other system administration that wishes to control vm.swappiness. In our case, we need to disable Senpai before adjusting vm.swappiness. * vm.swappiness is stateful - so a crash or restart of Senpai can leave a misconfigured setting. This requires some additional management to record the "desired" setting and ensure Senpai always adjusts to it. With this patch, we avoid these downsides of adjusting vm.swappiness globally. [1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html [2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598 Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Suggested-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Yue Zhao Cc: Zefan Li Cc: Nhat Pham Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 18 ++++---- include/linux/swap.h | 3 +- mm/memcontrol.c | 56 ++++++++++++++++++++----- mm/vmscan.c | 25 +++++++++-- 4 files changed, 80 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 17e6e956515640..0270517ade47cf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1296,17 +1296,10 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported. - Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). - Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1318,6 +1311,17 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. +The following nested keys are defined. + + ========== ================================ + swappiness Swappiness value to reclaim with + ========== ================================ + + Specifying a swappiness value instructs the kernel to perform + the reclaim with that swappiness value. Note that this has the + same semantics as vm.swappiness applied to memcg reclaim with + all the existing limitations and potential future extensions. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index d210a5dc43013c..41e4b484bc346d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dcfe5189e3be60..9ae6a11ecbf2ed 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -2475,7 +2476,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2781,7 +2783,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3707,7 +3709,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { ret = -EBUSY; break; } @@ -3821,7 +3823,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_retries--; } @@ -6787,7 +6789,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL); if (!reclaimed && !nr_retries--) break; @@ -6836,7 +6838,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL)) nr_reclaims--; continue; } @@ -6966,19 +6968,50 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; unsigned int reclaim_options; - int err; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + default: + return -EINVAL; + } + } reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { @@ -6997,7 +7030,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), - GFP_KERNEL, reclaim_options); + GFP_KERNEL, reclaim_options, + swappiness == -1 ? NULL : &swappiness); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index 44b92fb3d09775..895f03e2d9327b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -92,6 +92,11 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; +#ifdef CONFIG_MEMCG + /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ + int *proactive_swappiness; +#endif + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -227,6 +232,13 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + if (sc->proactive && sc->proactive_swappiness) + return *sc->proactive_swappiness; + return mem_cgroup_swappiness(memcg); +} #else static bool cgroup_reclaim(struct scan_control *sc) { @@ -242,6 +254,11 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + return READ_ONCE(vm_swappiness); +} #endif static void set_task_reclaim_state(struct task_struct *task, @@ -2328,7 +2345,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; - int swappiness = mem_cgroup_swappiness(memcg); + int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; @@ -2609,7 +2626,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; - return mem_cgroup_swappiness(memcg); + return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) @@ -6488,12 +6505,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, From fd8a03639636bd0b544b4108f1c1b216cce87156 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 15:55:49 +0100 Subject: [PATCH 565/707] bounds: support non-power-of-two CONFIG_NR_CPUS ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS to 24, we will only allocate 4 bits to store the number of CPUs instead of 5. Use bits_per() instead, which rounds up. Found by code inspection. The effect of this would probably be a misaccounting when doing NUMA balancing, so to a user, it would only be a performance penalty. The effects may be more wide-spread; it's hard to tell. Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}") Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8b04fc..c5a9fcd2d62281 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN From e31a7064b797812bca319e35af6f113d66bf0137 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jan 2024 15:16:30 -0700 Subject: [PATCH 566/707] arch and include: update LLVM Phabricator links reviews.llvm.org was LLVM's Phabricator instances for code review. It has been abandoned in favor of GitHub pull requests. While the majority of links in the kernel sources still work because of the work Fangrui has done turning the dynamic Phabricator instance into a static archive, there are some issues with that work, so preemptively convert all the links in the kernel sources to point to the commit on GitHub. Most of the commits have the corresponding differential review link in the commit message itself so there should not be any loss of fidelity in the relevant information. Link: https://discourse.llvm.org/t/update-on-github-pull-requests/71540/172 Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-2-eb09b59db071@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Conor Dooley Reviewed-by: Kees Cook Acked-by: Fangrui Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Mykola Lysenko Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 ++-- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/ftrace.h | 2 +- include/linux/compiler-clang.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d43513968..5a8acca4dbf495 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -382,7 +382,7 @@ config BROKEN_GAS_INST config BUILTIN_RETURN_ADDRESS_STRIPS_PAC bool # Clang's __builtin_return_adddress() strips the PAC since 12.0.0 - # https://reviews.llvm.org/D75044 + # https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2 default y if CC_IS_CLANG && (CLANG_VERSION >= 120000) # GCC's __builtin_return_address() strips the PAC since 11.1.0, # and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier @@ -2222,7 +2222,7 @@ config STACKPROTECTOR_PER_TASK config UNWIND_PATCH_PAC_INTO_SCS bool "Enable shadow call stack dynamically using code patching" - # needs Clang with https://reviews.llvm.org/D111780 incorporated + # needs Clang with https://github.com/llvm/llvm-project/commit/de07cde67b5d205d58690be012106022aea6d2b3 incorporated depends on CC_IS_CLANG && CLANG_VERSION >= 150000 depends on ARM64_PTR_AUTH_KERNEL && CC_HAS_BRANCH_PROT_PAC_RET depends on SHADOW_CALL_STACK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a06828..69d24f51392206 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -312,7 +312,7 @@ config AS_HAS_INSN def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) config AS_HAS_OPTION_ARCH - # https://reviews.llvm.org/D123515 + # https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 def_bool y depends on $(as-instr, .option arch$(comma) +m) depends on !$(as-instr, .option arch$(comma) -i) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 32917212295234..06874fb1311e5e 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -15,7 +15,7 @@ /* * Clang prior to 13 had "mcount" instead of "_mcount": - * https://reviews.llvm.org/D98881 + * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44 */ #if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 #define MCOUNT_NAME _mcount diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ddab1ef22beef3..f0a47afef12581 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -9,7 +9,7 @@ * Clang prior to 17 is being silly and considers many __cleanup() variables * as unused (because they are, their sole purpose is to go out of scope). * - * https://reviews.llvm.org/D152180 + * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 */ #undef __cleanup #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) From 983fac0ad90f86dcc52496b10c04495b6e20c735 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jan 2024 15:16:31 -0700 Subject: [PATCH 567/707] treewide: update LLVM Bugzilla links LLVM moved their issue tracker from their own Bugzilla instance to GitHub issues. While all of the links are still valid, they may not necessarily show the most up to date information around the issues, as all updates will occur on GitHub, not Bugzilla. Another complication is that the Bugzilla issue number is not always the same as the GitHub issue number. Thankfully, LLVM maintains this mapping through two shortlinks: https://llvm.org/bz -> https://bugs.llvm.org/show_bug.cgi?id= https://llvm.org/pr -> https://github.com/llvm/llvm-project/issues/ Switch all "https://bugs.llvm.org/show_bug.cgi?id=" links to the "https://llvm.org/pr" shortlink so that the links show the most up to date information. Each migrated issue links back to the Bugzilla entry, so there should be no loss of fidelity of information here. Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-3-eb09b59db071@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Acked-by: Fangrui Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Mykola Lysenko Signed-off-by: Andrew Morton --- arch/powerpc/Makefile | 4 ++-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/s390/include/asm/ftrace.h | 2 +- arch/x86/power/Makefile | 2 +- crypto/blake2b_generic.c | 2 +- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +- drivers/media/test-drivers/vicodec/codec-fwht.c | 2 +- drivers/regulator/Kconfig | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- lib/Kconfig.kasan | 2 +- lib/raid6/Makefile | 2 +- lib/stackinit_kunit.c | 2 +- mm/slab_common.c | 2 +- net/bridge/br_multicast.c | 2 +- security/Kconfig | 2 +- 16 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 051247027da0ba..457cee9b03ee04 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -144,11 +144,11 @@ CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mlong-double-128) # Clang unconditionally reserves r2 on ppc32 and does not support the flag -# https://bugs.llvm.org/show_bug.cgi?id=39555 +# https://llvm.org/pr39555 CFLAGS-$(CONFIG_PPC32) := $(call cc-option, -ffixed-r2) # Clang doesn't support -mmultiple / -mno-multiple -# https://bugs.llvm.org/show_bug.cgi?id=39556 +# https://llvm.org/pr39556 CFLAGS-$(CONFIG_PPC32) += $(call cc-option, $(MULTIPLEWORD)) CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 5c375ec1a3c608..05f5220960c63b 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -55,7 +55,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) hr->dawrx1 = vcpu->arch.dawrx1; } -/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */ +/* Use noinline_for_stack due to https://llvm.org/pr49610 */ static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs) { unsigned long *addr = (unsigned long *) regs; diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 5a82b08f03cd3e..621f23d5ae30a6 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -9,7 +9,7 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_CC_IS_CLANG -/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +/* https://llvm.org/pr41424 */ #define ftrace_return_address(n) 0UL #else #define ftrace_return_address(n) __builtin_return_address(n) diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 379777572bc9fe..e0cd7afd53022a 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -5,7 +5,7 @@ CFLAGS_cpu.o := -fno-stack-protector # Clang may incorrectly inline functions with stack protector enabled into -# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479 +# __restore_processor_state(): https://llvm.org/pr47479 CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO) obj-$(CONFIG_PM_SLEEP) += cpu.o diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c index 6704c035588967..32e380b714b6cc 100644 --- a/crypto/blake2b_generic.c +++ b/crypto/blake2b_generic.c @@ -102,7 +102,7 @@ static void blake2b_compress_one_generic(struct blake2b_state *S, ROUND(10); ROUND(11); #ifdef CONFIG_CC_IS_CLANG -#pragma nounroll /* https://bugs.llvm.org/show_bug.cgi?id=45803 */ +#pragma nounroll /* https://llvm.org/pr45803 */ #endif for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 06964a3c130f6a..a223bd10564b1b 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -105,7 +105,7 @@ lib-y := $(patsubst %.o,%.stub.o,$(lib-y)) # Even when -mbranch-protection=none is set, Clang will generate a # .note.gnu.property for code-less object files (like lib/ctype.c), # so work around this by explicitly removing the unwanted section. -# https://bugs.llvm.org/show_bug.cgi?id=46480 +# https://llvm.org/pr46480 STUBCOPY_FLAGS-y += --remove-section=.note.gnu.property STUBCOPY_RELOC-$(CONFIG_X86_32) := R_386_32 diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 2d688dca26bedb..78a2773b74f2f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -610,7 +610,7 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl) /* Set ring buffer size in dwords */ uint32_t rb_bufsz = order_base_2(ring->ring_size / 4); - barrier(); /* work around https://bugs.llvm.org/show_bug.cgi?id=42576 */ + barrier(); /* work around https://llvm.org/pr42576 */ rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SIZE, rb_bufsz); #ifdef __BIG_ENDIAN rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SWAP_ENABLE, 1); diff --git a/drivers/media/test-drivers/vicodec/codec-fwht.c b/drivers/media/test-drivers/vicodec/codec-fwht.c index 1ce682e1b85c32..fd75457d03b202 100644 --- a/drivers/media/test-drivers/vicodec/codec-fwht.c +++ b/drivers/media/test-drivers/vicodec/codec-fwht.c @@ -49,7 +49,7 @@ static const uint8_t zigzag[64] = { /* * noinline_for_stack to work around - * https://bugs.llvm.org/show_bug.cgi?id=38809 + * https://llvm.org/pr38809 */ static int noinline_for_stack rlc(const s16 *in, __be16 *output, int blocktype) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 550145f82726e9..7db0a29b5b8dcd 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -288,7 +288,7 @@ config REGULATOR_CROS_EC config REGULATOR_DA903X tristate "Dialog Semiconductor DA9030/DA9034 regulators" depends on PMIC_DA903X - depends on !CC_IS_CLANG # https://bugs.llvm.org/show_bug.cgi?id=38789 + depends on !CC_IS_CLANG # https://llvm.org/pr38789 help Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5dd3a61d673d4f..f7749d0f2562f1 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -984,7 +984,7 @@ * -fsanitize=thread produce unwanted sections (.eh_frame * and .init_array.*), but CONFIG_CONSTRUCTORS wants to * keep any .init_array.* sections. - * https://bugs.llvm.org/show_bug.cgi?id=46478 + * https://llvm.org/pr46478 */ #ifdef CONFIG_UNWIND_TABLES #define DISCARD_EH_FRAME diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index e6eda054ab275f..98016e137b7f09 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -158,7 +158,7 @@ config KASAN_STACK out-of-bounds bugs in stack variables. With Clang, stack instrumentation has a problem that causes excessive - stack usage, see https://bugs.llvm.org/show_bug.cgi?id=38809. Thus, + stack usage, see https://llvm.org/pr38809. Thus, with Clang, this option is deemed unsafe. This option is always disabled when compile-testing with Clang to diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 1c5420ff254e84..385a94aa0b999b 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -21,7 +21,7 @@ altivec_flags += -isystem $(shell $(CC) -print-file-name=include) ifdef CONFIG_CC_IS_CLANG # clang ppc port does not yet support -maltivec when -msoft-float is # enabled. A future release of clang will resolve this -# https://bugs.llvm.org/show_bug.cgi?id=31177 +# https://llvm.org/pr31177 CFLAGS_REMOVE_altivec1.o += -msoft-float CFLAGS_REMOVE_altivec2.o += -msoft-float CFLAGS_REMOVE_altivec4.o += -msoft-float diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index 05947a2feb93c0..7a10e1d1725817 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -404,7 +404,7 @@ static noinline int leaf_switch_2_none(unsigned long sp, bool fill, * These are expected to fail for most configurations because neither * GCC nor Clang have a way to perform initialization of variables in * non-code areas (i.e. in a switch statement before the first "case"). - * https://bugs.llvm.org/show_bug.cgi?id=44916 + * https://llvm.org/pr44916 */ DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, ALWAYS_FAIL); DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, ALWAYS_FAIL); diff --git a/mm/slab_common.c b/mm/slab_common.c index 238293b1dbe14b..954af676d79ee8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -651,7 +651,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = -{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; +{ /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_RANDOM_KMALLOC_CACHES diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d7d021af102981..523f72ac9633d2 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -5043,7 +5043,7 @@ void br_multicast_uninit_stats(struct net_bridge *br) free_percpu(br->mcast_stats); } -/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */ +/* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; diff --git a/security/Kconfig b/security/Kconfig index 52c9af08ad35d3..606a87c29a0170 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -142,7 +142,7 @@ config HARDENED_USERCOPY config FORTIFY_SOURCE bool "Harden common str/mem functions against buffer overflows" depends on ARCH_HAS_FORTIFY_SOURCE - # https://bugs.llvm.org/show_bug.cgi?id=41459 + # https://llvm.org/pr41459 depends on !CC_IS_CLANG || CLANG_VERSION >= 120001 # https://github.com/llvm/llvm-project/issues/53645 depends on !CC_IS_CLANG || !X86_32 From 28dab03cca2028c790d15d7ad61c4b30c7f5d0d7 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 8 Jan 2024 23:51:32 +0800 Subject: [PATCH 568/707] selftests: add eventfd selftests This adds the promised selftest for eventfd. It will verify the flags of eventfd2, including EFD_CLOEXEC, EFD_NONBLOCK and EFD_SEMAPHORE. Link: https://lkml.kernel.org/r/tencent_3C9A298878D22B5D8F79DC2FEE99BB4A8F05@qq.com Signed-off-by: Wen Yang Cc: Shuah Khan Cc: Javier Martinez Canillas Cc: Christian Brauner Cc: Pengfei Xu Cc: Miklos Szeredi Cc: Andrei Vagin Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- .../selftests/filesystems/eventfd/.gitignore | 2 + .../selftests/filesystems/eventfd/Makefile | 7 + .../filesystems/eventfd/eventfd_test.c | 186 ++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c diff --git a/tools/testing/selftests/filesystems/eventfd/.gitignore b/tools/testing/selftests/filesystems/eventfd/.gitignore new file mode 100644 index 00000000000000..483faf59fe4adb --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +eventfd_test diff --git a/tools/testing/selftests/filesystems/eventfd/Makefile b/tools/testing/selftests/filesystems/eventfd/Makefile new file mode 100644 index 00000000000000..0a8e3910df1572 --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lpthread +TEST_GEN_PROGS := eventfd_test + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c new file mode 100644 index 00000000000000..f142a137526cda --- /dev/null +++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +struct error { + int code; + char msg[512]; +}; + +static int error_set(struct error *err, int code, const char *fmt, ...) +{ + va_list args; + int r; + + if (code == 0 || !err || err->code != 0) + return code; + + err->code = code; + va_start(args, fmt); + r = vsnprintf(err->msg, sizeof(err->msg), fmt, args); + assert((size_t)r < sizeof(err->msg)); + va_end(args); + + return code; +} + +static inline int sys_eventfd2(unsigned int count, int flags) +{ + return syscall(__NR_eventfd2, count, flags); +} + +TEST(eventfd01) +{ + int fd, flags; + + fd = sys_eventfd2(0, 0); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + // since the kernel automatically added O_RDWR. + EXPECT_EQ(flags, O_RDWR); + + close(fd); +} + +TEST(eventfd02) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_CLOEXEC); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFD); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags, FD_CLOEXEC); + + close(fd); +} + +TEST(eventfd03) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_NONBLOCK); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK); + EXPECT_EQ(flags & O_RDWR, O_RDWR); + + close(fd); +} + +TEST(eventfd04) +{ + int fd, flags; + + fd = sys_eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK); + ASSERT_GE(fd, 0); + + flags = fcntl(fd, F_GETFL); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK); + EXPECT_EQ(flags & O_RDWR, O_RDWR); + + flags = fcntl(fd, F_GETFD); + ASSERT_GT(flags, -1); + EXPECT_EQ(flags, FD_CLOEXEC); + + close(fd); +} + +static inline void trim_newline(char *str) +{ + char *pos = strrchr(str, '\n'); + + if (pos) + *pos = '\0'; +} + +static int verify_fdinfo(int fd, struct error *err, const char *prefix, + size_t prefix_len, const char *expect, ...) +{ + char buffer[512] = {0, }; + char path[512] = {0, }; + va_list args; + FILE *f; + char *line = NULL; + size_t n = 0; + int found = 0; + int r; + + va_start(args, expect); + r = vsnprintf(buffer, sizeof(buffer), expect, args); + assert((size_t)r < sizeof(buffer)); + va_end(args); + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + f = fopen(path, "re"); + if (!f) + return error_set(err, -1, "fdinfo open failed for %d", fd); + + while (getline(&line, &n, f) != -1) { + char *val; + + if (strncmp(line, prefix, prefix_len)) + continue; + + found = 1; + + val = line + prefix_len; + r = strcmp(val, buffer); + if (r != 0) { + trim_newline(line); + trim_newline(buffer); + error_set(err, -1, "%s '%s' != '%s'", + prefix, val, buffer); + } + break; + } + + free(line); + fclose(f); + + if (found == 0) + return error_set(err, -1, "%s not found for fd %d", + prefix, fd); + + return 0; +} + +TEST(eventfd05) +{ + struct error err = {0}; + int fd, ret; + + fd = sys_eventfd2(0, EFD_SEMAPHORE); + ASSERT_GE(fd, 0); + + ret = fcntl(fd, F_GETFL); + ASSERT_GT(ret, -1); + EXPECT_EQ(ret & O_RDWR, O_RDWR); + + // The semaphore could only be obtained from fdinfo. + ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n"); + if (ret != 0) + ksft_print_msg("eventfd-semaphore check failed, msg: %s\n", + err.msg); + EXPECT_EQ(ret, 0); + + close(fd); +} + +TEST_HARNESS_MAIN From bdb59f8a69909f583b41d6363ce70a6e50d52b85 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:33 +0100 Subject: [PATCH 569/707] list: add hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a generic hlist_count_nodes() function and use it in two drivers. This patch (of 3): Add a function to count nodes in a hlist. hlist_count_nodes() is similar to list_count_nodes(). Link: https://lkml.kernel.org/r/20240104164937.424320-1-pierre.gondois@arm.com Link: https://lkml.kernel.org/r/20240104164937.424320-2-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Reviewed-by: Carlos Llamas Acked-by: Coly Li Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- include/linux/list.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/list.h b/include/linux/list.h index 059aa1fff41e9c..523b7c4d000a1f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1195,4 +1195,19 @@ static inline void hlist_splice_init(struct hlist_head *from, pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) +/** + * hlist_count_nodes - count nodes in the hlist + * @head: the head for your hlist. + */ +static inline size_t hlist_count_nodes(struct hlist_head *head) +{ + struct hlist_node *pos; + size_t count = 0; + + hlist_for_each(pos, head) + count++; + + return count; +} + #endif From 4490487dd154734cab35578e1e741cf134ff5a30 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:34 +0100 Subject: [PATCH 570/707] binder: use of hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the newly added hlist_count_nodes(). Link: https://lkml.kernel.org/r/20240104164937.424320-3-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Acked-by: Carlos Llamas Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Christian Brauner Cc: Coly Li Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- drivers/android/binder.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8dd23b19e99731..5a4f8d7aa05169 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6076,9 +6076,7 @@ static void print_binder_node_nilocked(struct seq_file *m, struct binder_work *w; int count; - count = 0; - hlist_for_each_entry(ref, &node->refs, node_entry) - count++; + count = hlist_count_nodes(&node->refs); seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, From 7a2af157898260b4a323581a92a3b646dc2affad Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:35 +0100 Subject: [PATCH 571/707] bcache: use of hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the newly added hlist_count_nodes(). Link: https://lkml.kernel.org/r/20240104164937.424320-4-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Acked-by: Coly Li Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Carlos Llamas Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- drivers/md/bcache/sysfs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a438efb660699b..6956beb55326f5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -702,13 +702,7 @@ static unsigned int bch_cache_max_chain(struct cache_set *c) for (h = c->bucket_hash; h < c->bucket_hash + (1 << BUCKET_HASH_BITS); h++) { - unsigned int i = 0; - struct hlist_node *p; - - hlist_for_each(p, h) - i++; - - ret = max(ret, i); + ret = max(ret, hlist_count_nodes(h)); } mutex_unlock(&c->bucket_lock); From 380111fbb447b4ad1dc4658a0741588e95179a41 Mon Sep 17 00:00:00 2001 From: Yongzhen Zhang Date: Mon, 8 Jan 2024 09:56:04 +0800 Subject: [PATCH 572/707] ocfs2: Spelling fix Modify reques to request in the comment. Link: https://lkml.kernel.org/r/20240108015604.38377-1-zhangyongzhen@kylinos.cn Signed-off-by: Yongzhen Zhang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlmglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64a6ef638495c2..cb40cafbc06237 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1615,7 +1615,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); - /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); spin_unlock_irqrestore(&lockres->l_lock, flags); From 321d96d70dd5dea785e5ebde8e3a0cb4bbd63c55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Jan 2024 14:01:55 -0800 Subject: [PATCH 573/707] lib/win_minmax: fix header comments Don't use "/**" kernel-doc comment marker for non-kernel-doc comment. Correct the filename but omit the path since we know where it is and it could change (but not likely). Link: https://lkml.kernel.org/r/20240107220155.29013-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/win_minmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h index 4ca2842d2842d0..6a5bb052fcc27f 100644 --- a/include/linux/win_minmax.h +++ b/include/linux/win_minmax.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** - * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. +/* + * win_minmax.h: windowed min/max tracker by Kathleen Nichols. * */ #ifndef MINMAX_H From 13f86266e3cd77a19ebabab9f28be63b566eaaee Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 7 Jan 2024 17:16:41 +0800 Subject: [PATCH 574/707] panic: suppress gnu_printf warning with GCC 13.2.1 and W=1, there's compiling warning like this: kernel/panic.c: In function `__warn': kernel/panic.c:676:17: warning: function `__warn' might be a candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format] 676 | vprintk(args->fmt, args->args); | ^~~~~~~ The normal __printf(x,y) adding can't fix it. So add workaround which disables -Wsuggest-attribute=format to mute it. Link: https://lkml.kernel.org/r/20240107091641.579849-1-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Andrew Morton --- kernel/panic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index 2807639aab51d1..d49b68184c563e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -666,8 +666,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint, pr_warn("WARNING: CPU: %d PID: %d at %pS\n", raw_smp_processor_id(), current->pid, caller); +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif if (args) vprintk(args->fmt, args->args); +#pragma GCC diagnostic pop print_modules(); From 74f4422df6e27efcf42e7ffef9340fc3c0b7f402 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:12 +0800 Subject: [PATCH 575/707] lib min_heap: optimize number of calls to min_heapify() Patch series "lib min_heap: Min heap optimizations". The purpose of this patch series is to enhance the existing min heap implementation. The optimization focuses on both the heap construction process and the number of comparisons made during the heapify operation. This patch (of 2): Improve the heap construction process by reducing unnecessary heapify operations. Specifically, adjust the starting condition from n / 2 to n / 2 - 1 in the loop that iterates over all non-leaf elements. Link: https://lkml.kernel.org/r/20240110081213.2289636-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20240110081213.2289636-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 44077837385f89..18a581310eb350 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -70,7 +70,7 @@ void min_heapify_all(struct min_heap *heap, { int i; - for (i = heap->nr / 2; i >= 0; i--) + for (i = heap->nr / 2 - 1; i >= 0; i--) min_heapify(heap, i, func); } From c145b97f2dcb45ad32f59c559c624c55a1c08069 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:13 +0800 Subject: [PATCH 576/707] lib min_heap: optimize number of comparisons in min_heapify() Optimize the min_heapify() function, resulting in a significant reduction of approximately 50% in the number of comparisons for large random inputs, while maintaining identical results. The current implementation performs two comparisons per level to identify the minimum among three elements. In contrast, the proposed bottom-up variation uses only one comparison per level to assess two children until reaching the leaves. Then, it sifts up until the correct position is determined. Typically, the process of sifting down proceeds to the leaf level, resulting in O(1) secondary comparisons instead of log2(n). This optimization significantly reduces the number of costly indirect function calls and improves overall performance. Link: https://lkml.kernel.org/r/20240110081213.2289636-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 18a581310eb350..d52daf45861b9a 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -35,31 +35,33 @@ static __always_inline void min_heapify(struct min_heap *heap, int pos, const struct min_heap_callbacks *func) { - void *left, *right, *parent, *smallest; + void *left, *right; void *data = heap->data; + void *root = data + pos * func->elem_size; + int i = pos, j; + /* Find the sift-down path all the way to the leaves. */ for (;;) { - if (pos * 2 + 1 >= heap->nr) + if (i * 2 + 2 >= heap->nr) break; + left = data + (i * 2 + 1) * func->elem_size; + right = data + (i * 2 + 2) * func->elem_size; + i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; + } - left = data + ((pos * 2 + 1) * func->elem_size); - parent = data + (pos * func->elem_size); - smallest = parent; - if (func->less(left, smallest)) - smallest = left; - - if (pos * 2 + 2 < heap->nr) { - right = data + ((pos * 2 + 2) * func->elem_size); - if (func->less(right, smallest)) - smallest = right; - } - if (smallest == parent) - break; - func->swp(smallest, parent); - if (smallest == left) - pos = (pos * 2) + 1; - else - pos = (pos * 2) + 2; + /* Special case for the last leaf with no sibling. */ + if (i * 2 + 2 == heap->nr) + i = i * 2 + 1; + + /* Backtrack to the correct location. */ + while (i != pos && func->less(root, data + i * func->elem_size)) + i = (i - 1) / 2; + + /* Shift the element into its correct place. */ + j = i; + while (i != pos) { + i = (i - 1) / 2; + func->swp(data + i * func->elem_size, data + j * func->elem_size); } } From 0df2b25ce9d89976fba0c800e5562148a3345a46 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:41 +0000 Subject: [PATCH 577/707] sysctl: allow change system v ipc sysctls inside ipc namespace Patch series "Allow to change ipc/mq sysctls inside ipc namespace", v3. Right now ipc and mq limits count as per ipc namespace, but only real root can change them. By default, the current values of these limits are such that it can only be reduced. Since only root can change the values, it is impossible to reduce these limits in the rootless container. We can allow limit changes within ipc namespace because mq parameters are limited by RLIMIT_MSGQUEUE and ipc parameters are not limited to anything other than cgroups. This patch (of 3): Rootless containers are not allowed to modify kernel IPC parameters. All default limits are set to such high values that in fact there are no limits at all. All limits are not inherited and are initialized to default values when a new ipc_namespace is created. For new ipc_namespace: size_t ipc_ns.shm_ctlmax = SHMMAX; // (ULONG_MAX - (1UL << 24)) size_t ipc_ns.shm_ctlall = SHMALL; // (ULONG_MAX - (1UL << 24)) int ipc_ns.shm_ctlmni = IPCMNI; // (1 << 15) int ipc_ns.shm_rmid_forced = 0; unsigned int ipc_ns.msg_ctlmax = MSGMAX; // 8192 unsigned int ipc_ns.msg_ctlmni = MSGMNI; // 32000 unsigned int ipc_ns.msg_ctlmnb = MSGMNB; // 16384 The shm_tot (total amount of shared pages) has also ceased to be global, it is located in ipc_namespace and is not inherited from anywhere. In such conditions, it cannot be said that these limits limit anything. The real limiter for them is cgroups. If we allow rootless containers to change these parameters, then it can only be reduced. Link: https://lkml.kernel.org/r/cover.1705333426.git.legion@kernel.org Link: https://lkml.kernel.org/r/d2f4603305cbfed58a24755aa61d027314b73a45.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/e2d84d3ec0172cfff759e6065da84ce0cc2736f8.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- ipc/ipc_sysctl.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 8c62e443f78b3c..01c4a50d22b2d2 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "util.h" static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, @@ -190,25 +191,57 @@ static int set_is_seen(struct ctl_table_set *set) return ¤t->nsproxy->ipc_ns->ipc_set == set; } +static void ipc_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; +} + static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table) { int mode = table->mode; #ifdef CONFIG_CHECKPOINT_RESTORE - struct ipc_namespace *ns = current->nsproxy->ipc_ns; + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && checkpoint_restore_ns_capable(ns->user_ns)) mode = 0666; + else #endif - return mode; + { + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + } + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = ipc_permissions, + .set_ownership = ipc_set_ownership, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) From 41d8f2d3ad4aa64c15752928a2edbe9af5470fec Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:42 +0000 Subject: [PATCH 578/707] docs: add information about ipc sysctls limitations After 25b21cb2f6d6 ("[PATCH] IPC namespace core") and 4e9823111bdc ("[PATCH] IPC namespace - shm") the shared memory page count stopped being global and started counting per ipc namespace. The documentation and shmget(2) still says that shmall is a global option. shmget(2): SHMALL System-wide limit on the total amount of shared memory, measured in units of the system page size. On Linux, this limit can be read and modified via /proc/sys/kernel/shmall. I think the changes made in 2006 should be documented. Link: https://lkml.kernel.org/r/09e99911071766958af488beb4e8a728a4f12135.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/ede20ddf7be48b93e8084c3be2e920841ee1a641.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Davidlohr Bueso Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 6584a1f9bfe39d..bc578663619d6e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -594,6 +594,9 @@ default (``MSGMNB``). ``msgmni`` is the maximum number of IPC queues. 32000 by default (``MSGMNI``). +All of these parameters are set per ipc namespace. The maximum number of bytes +in POSIX message queues is limited by ``RLIMIT_MSGQUEUE``. This limit is +respected hierarchically in the each user namespace. msg_next_id, sem_next_id, and shm_next_id (System V IPC) ======================================================== @@ -1274,15 +1277,20 @@ are doing anyway :) shmall ====== -This parameter sets the total amount of shared memory pages that -can be used system wide. Hence, ``shmall`` should always be at least -``ceil(shmmax/PAGE_SIZE)``. +This parameter sets the total amount of shared memory pages that can be used +inside ipc namespace. The shared memory pages counting occurs for each ipc +namespace separately and is not inherited. Hence, ``shmall`` should always be at +least ``ceil(shmmax/PAGE_SIZE)``. If you are not sure what the default ``PAGE_SIZE`` is on your Linux system, you can run the following command:: # getconf PAGE_SIZE +To reduce or disable the ability to allocate shared memory, you must create a +new ipc namespace, set this parameter to the required value and prohibit the +creation of a new ipc namespace in the current user namespace or cgroups can +be used. shmmax ====== From 29dcc97276179e45688de7cb53b6b69c3d85d4c4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 15 Jan 2024 15:46:43 +0000 Subject: [PATCH 579/707] sysctl: allow to change limits for posix messages queues All parameters of posix messages queues (queues_max/msg_max/msgsize_max) end up being limited by RLIMIT_MSGQUEUE. The code in mqueue_get_inode is where that limiting happens. The RLIMIT_MSGQUEUE is bound to the user namespace and is counted hierarchically. We can allow root in the user namespace to modify the posix messages queues parameters. Link: https://lkml.kernel.org/r/6ad67f23d1459a4f4339f74aa73bac0ecf3995e1.1705333426.git.legion@kernel.org Signed-off-by: Alexey Gladkov Signed-off-by: Eric W. Biederman Link: https://lkml.kernel.org/r/7eb21211c8622e91d226e63416b1b93c079f60ee.1663756794.git.legion@kernel.org Cc: Christian Brauner Cc: Davidlohr Bueso Cc: Joel Granados Cc: Kees Cook Cc: Luis Chamberlain Cc: Manfred Spraul Signed-off-by: Andrew Morton --- ipc/mq_sysctl.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index ebb5ed81c151a8..21fba3a6edaf7a 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -12,6 +12,7 @@ #include #include #include +#include static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -76,8 +77,43 @@ static int set_is_seen(struct ctl_table_set *set) return ¤t->nsproxy->ipc_ns->mq_set == set; } +static void mq_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, mq_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; +} + +static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table) +{ + int mode = table->mode; + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; +} + static struct ctl_table_root set_root = { .lookup = set_lookup, + .permissions = mq_permissions, + .set_ownership = mq_set_ownership, }; bool setup_mq_sysctls(struct ipc_namespace *ns) From 31607d64203b6dc464ab54d76f4d6e39e31a708b Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 15 Jan 2024 14:25:19 +0800 Subject: [PATCH 580/707] user_namespace: Remove unnecessary NULL values from kbuf kbuf is assigned first, so it does not need to initialize the assignment. Link: https://lkml.kernel.org/r/20240115062519.31298-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce4d99df5f0eb4..0b0b95418b16a7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; - char *kbuf = NULL, *pos, *next_line; + char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ From 7908dfd28df944dfbd26111ad0954fdd6e3fdba4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 13 Jan 2024 11:13:51 +0800 Subject: [PATCH 581/707] lib/sort: optimize heapsort for equal elements in sift-down path Patch series "lib/sort: Optimize the number of swaps and comparisons". This patch series aims to optimize the heapsort algorithm, specifically targeting a reduction in the number of swaps and comparisons required. This patch (of 2): Currently, when searching for the sift-down path and encountering equal elements, the algorithm chooses the left child. However, considering that the height of the right subtree may be one less than that of the left subtree, selecting the right child in such cases can potentially reduce the number of comparisons and swaps. For instance, when sorting an array of 10,000 identical elements, the current implementation requires 247,209 comparisons. With this patch, the number of comparisons can be reduced to 227,241. Link: https://lkml.kernel.org/r/20240113031352.2395118-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20240113031352.2395118-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Signed-off-by: Andrew Morton --- lib/sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sort.c b/lib/sort.c index b399bf10d6759b..fe4efd4a1410f7 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -262,7 +262,7 @@ void sort_r(void *base, size_t num, size_t size, * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) - b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d; + b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; From 26a69239d7edbc74e50286dade3a99ca1da5446d Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 13 Jan 2024 11:13:52 +0800 Subject: [PATCH 582/707] lib/sort: Optimize heapsort with double-pop variation Instead of popping only the maximum element from the heap during each iteration, we now pop the two largest elements at once. Although this introduces an additional comparison to determine the second largest element, it enables a reduction in the height of the tree by one during the heapify operations starting from root's left/right child. This reduction in tree height by one leads to a decrease of one comparison and one swap. This optimization results in saving approximately 0.5 * n swaps without increasing the number of comparisons. Additionally, the heap size during heapify is now one less than the original size, offering a chance for further reduction in comparisons and swaps. The following experimental data is based on the array generated using get_random_u32(). | N | swaps (old) | swaps (new) | comparisons (old) | comparisons (new) | |-------|-------------|-------------|-------------------|-------------------| | 1000 | 9054 | 8569 | 10328 | 10320 | | 2000 | 20137 | 19182 | 22634 | 22587 | | 3000 | 32062 | 30623 | 35833 | 35752 | | 4000 | 44274 | 42282 | 49332 | 49306 | | 5000 | 57195 | 54676 | 63300 | 63294 | | 6000 | 70205 | 67202 | 77599 | 77557 | | 7000 | 83276 | 79831 | 92113 | 92032 | | 8000 | 96630 | 92678 | 106635 | 106617 | | 9000 | 110349 | 105883 | 121505 | 121404 | | 10000 | 124165 | 119202 | 136628 | 136617 | Link: https://lkml.kernel.org/r/20240113031352.2395118-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: George Spelvin Signed-off-by: Andrew Morton --- lib/sort.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/sort.c b/lib/sort.c index fe4efd4a1410f7..a0509088f82aa5 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -215,6 +215,7 @@ void sort_r(void *base, size_t num, size_t size, /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ + size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; @@ -242,12 +243,21 @@ void sort_r(void *base, size_t num, size_t size, for (;;) { size_t b, c, d; - if (a) /* Building heap: sift down --a */ - a -= size; - else if (n -= size) /* Sorting: Extract root to --n */ + if (a) /* Building heap: sift down a */ + a -= size << shift; + else if (n > 3 * size) { /* Sorting: Extract two largest elements */ + n -= size; do_swap(base, base + n, size, swap_func, priv); - else /* Sort complete */ + shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; + a = size << shift; + n -= size; + do_swap(base + a, base + n, size, swap_func, priv); + } else if (n > size) { /* Sorting: Extract root */ + n -= size; + do_swap(base, base + n, size, swap_func, priv); + } else { /* Sort complete */ break; + } /* * Sift element at "a" down into heap. This is the From ccef4769a428e3e2056f9698c3212677192c2a09 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Wed, 17 Jan 2024 06:16:36 +0000 Subject: [PATCH 583/707] kprobes: use synchronize_rcu_tasks_rude in kprobe_optimizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a deadlock scenario in kprobe_optimizer(): pid A pid B pid C kprobe_optimizer() do_exit() perf_kprobe_init() mutex_lock(&kprobe_mutex) exit_tasks_rcu_start() mutex_lock(&kprobe_mutex) synchronize_rcu_tasks() zap_pid_ns_processes() // waiting kprobe_mutex // waiting tasks_rcu_exit_srcu kernel_wait4() // waiting pid C exit To avoid this deadlock loop, use synchronize_rcu_tasks_rude() in kprobe_optimizer() rather than synchronize_rcu_tasks(). synchronize_rcu_tasks_rude() can also promise that all preempted tasks have scheduled, but it will not wait tasks_rcu_exit_srcu. Link: https://lkml.kernel.org/r/20240117061636.288412-1-chenzhongjin@huawei.com Fixes: a30b85df7d59 ("kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y") Signed-off-by: Chen Zhongjin Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Douglas Anderson Cc: Eric DeVolder Cc: Jakob Koschel Cc: Juerg Haefliger Cc: "Masami Hiramatsu (Google)" Cc: Michael Ellerman (powerpc) Cc: Mickaël Salaün Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rick Edgecombe Cc: Thomas Gleixner Cc: Yang Jihong Signed-off-by: Andrew Morton --- arch/Kconfig | 2 +- kernel/kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index c91917b508736d..fe3942d9f0bf56 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -104,7 +104,7 @@ config STATIC_CALL_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU config KPROBES_ON_FTRACE def_bool y diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d5a0ee40bf66c5..09056ae50c5832 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -623,7 +623,7 @@ static void kprobe_optimizer(struct work_struct *work) * Note that on non-preemptive kernel, this is transparently converted * to synchronoze_sched() to wait for all interrupts to have completed. */ - synchronize_rcu_tasks(); + synchronize_rcu_tasks_rude(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); From 7343ccc1e41c7bc75dad6fc79b45a37c19efa572 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 17 Jan 2024 12:33:11 -0800 Subject: [PATCH 584/707] kprobes-use-synchronize_rcu_tasks_rude-in-kprobe_optimizer-fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unrelated comment typo fix Cc: Anil S Keshavamurthy Cc: Chen Zhongjin Cc: David S. Miller Cc: Douglas Anderson Cc: Eric DeVolder Cc: Jakob Koschel Cc: Juerg Haefliger Cc: "Masami Hiramatsu (Google)" Cc: Michael Ellerman Cc: Mickaël Salaün Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rick Edgecombe Cc: Thomas Gleixner Cc: Yang Jihong Signed-off-by: Andrew Morton --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 09056ae50c5832..e8512177b8bb4a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -621,7 +621,7 @@ static void kprobe_optimizer(struct work_struct *work) * instruction is preempted. In that case, such tasks can return * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. * Note that on non-preemptive kernel, this is transparently converted - * to synchronoze_sched() to wait for all interrupts to have completed. + * to synchronize_sched() to wait for all interrupts to have completed. */ synchronize_rcu_tasks_rude(); From 24e8b9bdc03565824b4cfbc50c5a8dd61b374421 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:13:21 +0800 Subject: [PATCH 585/707] flex_proportions: remove unused fprop_local_single The single variant of flex_proportions is not used. Simply remove it. Link: https://lkml.kernel.org/r/20240118201321.759174-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/flex_proportions.h | 32 ------------- lib/flex_proportions.c | 77 -------------------------------- 2 files changed, 109 deletions(-) diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 3e378b1fb0bc82..e9a72fd0bfe78b 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -38,38 +38,6 @@ int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); -/* - * ---- SINGLE ---- - */ -struct fprop_local_single { - /* the local events counter */ - unsigned long events; - /* Period in which we last updated events */ - unsigned int period; - raw_spinlock_t lock; /* Protect period and numerator */ -}; - -#define INIT_FPROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int fprop_local_init_single(struct fprop_local_single *pl); -void fprop_local_destroy_single(struct fprop_local_single *pl); -void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl); -void fprop_fraction_single(struct fprop_global *p, - struct fprop_local_single *pl, unsigned long *numerator, - unsigned long *denominator); - -static inline -void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __fprop_inc_single(p, pl); - local_irq_restore(flags); -} - /* * ---- PERCPU ---- */ diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 83332fefa6f42e..84ecccddc77182 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -83,83 +83,6 @@ bool fprop_new_period(struct fprop_global *p, int periods) return true; } -/* - * ---- SINGLE ---- - */ - -int fprop_local_init_single(struct fprop_local_single *pl) -{ - pl->events = 0; - pl->period = 0; - raw_spin_lock_init(&pl->lock); - return 0; -} - -void fprop_local_destroy_single(struct fprop_local_single *pl) -{ -} - -static void fprop_reflect_period_single(struct fprop_global *p, - struct fprop_local_single *pl) -{ - unsigned int period = p->period; - unsigned long flags; - - /* Fast path - period didn't change */ - if (pl->period == period) - return; - raw_spin_lock_irqsave(&pl->lock, flags); - /* Someone updated pl->period while we were spinning? */ - if (pl->period >= period) { - raw_spin_unlock_irqrestore(&pl->lock, flags); - return; - } - /* Aging zeroed our fraction? */ - if (period - pl->period < BITS_PER_LONG) - pl->events >>= period - pl->period; - else - pl->events = 0; - pl->period = period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* Event of type pl happened */ -void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) -{ - fprop_reflect_period_single(p, pl); - pl->events++; - percpu_counter_add(&p->events, 1); -} - -/* Return fraction of events of type pl */ -void fprop_fraction_single(struct fprop_global *p, - struct fprop_local_single *pl, - unsigned long *numerator, unsigned long *denominator) -{ - unsigned int seq; - s64 num, den; - - do { - seq = read_seqcount_begin(&p->sequence); - fprop_reflect_period_single(p, pl); - num = pl->events; - den = percpu_counter_read_positive(&p->events); - } while (read_seqcount_retry(&p->sequence, seq)); - - /* - * Make fraction <= 1 and denominator > 0 even in presence of percpu - * counter errors - */ - if (den <= num) { - if (num) - den = num; - else - den = 1; - } - *denominator = den; - *numerator = num; -} - /* * ---- PERCPU ---- */ From 8b088f4def0cc472369ac39bb3a3a393160a5c51 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 18:16:31 +0100 Subject: [PATCH 586/707] ptrace_attach: shift send(SIGSTOP) into ptrace_set_stopped() Turn send_sig_info(SIGSTOP) into send_signal_locked(SIGSTOP) and move it from ptrace_attach() to ptrace_set_stopped(). This looks more logical and avoids lock(siglock) right after unlock(). Link: https://lkml.kernel.org/r/20240122171631.GA29844@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d65988..d5f89f9ef29f65 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data) return 0; } -static inline void ptrace_set_stopped(struct task_struct *task) +static inline void ptrace_set_stopped(struct task_struct *task, bool seize) { guard(spinlock)(&task->sighand->siglock); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING @@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request, return -EPERM; task->ptrace = flags; - ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - - ptrace_set_stopped(task); + ptrace_set_stopped(task, seize); } } From 939f96166af258188876a5d720b0aa3ed97072e0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:43 +0100 Subject: [PATCH 587/707] lib: dhry: remove unneeded Patch series "lib: dhry: miscellaneous cleanups". This patch series contains a few miscellaneous cleanups for the Dhrystone benchmark test. This patch (of 3): The Dhrystone benchmark test does not use mutexes. Link: https://lkml.kernel.org/r/cover.1705934853.git.geert+renesas@glider.be Link: https://lkml.kernel.org/r/cf8fafaedccf96143f1513745c43a457480bfc24.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/dhry_run.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/dhry_run.c b/lib/dhry_run.c index f15ac666e9d38b..e6a279dabf848e 100644 --- a/lib/dhry_run.c +++ b/lib/dhry_run.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #define DHRY_VAX 1757 From 99de0f4878a9c624b4a06f6e9546ec95a3bfa24e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:44 +0100 Subject: [PATCH 588/707] lib: dhry: use ktime_ms_delta() helper Use the existing ktime_ms_delta() helper instead of open-coding the same operation. Link: https://lkml.kernel.org/r/bb43c67a7580de6152f5e6eb225071166d33b6e4.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/dhry_1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dhry_1.c b/lib/dhry_1.c index 08edbbb19f573f..ca6c87232c5809 100644 --- a/lib/dhry_1.c +++ b/lib/dhry_1.c @@ -277,7 +277,7 @@ int dhry(int n) dhry_assert_string_eq(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); dhry_assert_string_eq(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); - User_Time = ktime_to_ms(ktime_sub(End_Time, Begin_Time)); + User_Time = ktime_ms_delta(End_Time, Begin_Time); kfree(Ptr_Glob); kfree(Next_Ptr_Glob); From e495a4c40a4122d495664709823e966d3e0b1916 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 15:50:45 +0100 Subject: [PATCH 589/707] lib: dhry: add missing closing parenthesis The help text for the Dhrystone benchmark test lacks a matching closing parenthesis. Link: https://lkml.kernel.org/r/772b43271bcb3dd17a6aae671b2084f08c05b079.1705934853.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 975a07f9f1cc08..8f502f15dc7fb5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2142,7 +2142,7 @@ config TEST_DHRY To run the benchmark, it needs to be enabled explicitly, either from the kernel command line (when built-in), or from userspace (when - built-in or modular. + built-in or modular). Run once during kernel boot: From 3fa0dc95b4a2d0d3901b019c087b920c63f1668f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:49 +0900 Subject: [PATCH 590/707] nilfs2: convert segment buffer to use kmap_local In the segment buffer code used for log writing, a CRC calculation routine uses the deprecated kmap_atomic(), so convert it to use kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6e59dc19a73249..dc431b4c34c96c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } raw_sum->ss_datasum = cpu_to_le32(crc); } From 3f0169197563cc5d0efbf4b20cb7cacf9ef58034 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:50 +0900 Subject: [PATCH 591/707] nilfs2: convert nilfs_copy_buffer() to use kmap_local The routine nilfs_copy_buffer() that copies a block buffer still uses the deprecated kmap_atomic(), so convert it to use kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-4-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5c2eba1987bd70..14e470fb88706a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) struct page *spage = sbh->b_page, *dpage = dbh->b_page; struct buffer_head *bh; - kaddr0 = kmap_atomic(spage); - kaddr1 = kmap_atomic(dpage); + kaddr0 = kmap_local_page(spage); + kaddr1 = kmap_local_page(dpage); memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_atomic(kaddr1); - kunmap_atomic(kaddr0); + kunmap_local(kaddr1); + kunmap_local(kaddr0); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; From 1319083353489215d23d8cdedd5fa09f740e3020 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:51 +0900 Subject: [PATCH 592/707] nilfs2: convert metadata file common code to use kmap_local In the common code of metadata files, the new block creation routine nilfs_mdt_insert_new_block() still uses the deprecated kmap_atomic(), so convert it to use kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-5-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index e45c01a559c013..4f792a0ad0f0ff 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); - kunmap_atomic(kaddr); + kunmap_local(kaddr); set_buffer_uptodate(bh); mark_buffer_dirty(bh); From 690bb2b2b6d4cf3e35ad5c44bad1b46d547ff2e7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:52 +0900 Subject: [PATCH 593/707] nilfs2: convert sufile to use kmap_local Concerning the code of the metadata file sufile for segment management, convert all parts that uses the deprecated kmap_atomic() to use kmap_local. All transformations are directly possible here. Link: https://lkml.kernel.org/r/20240122140202.6950-6-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 86 +++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 0a8119456c2136..abf05dc5750c79 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, struct nilfs_sufile_header *header; void *kaddr; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); } @@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); last_alloc = le64_to_cpu(header->sh_last_alloc); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; @@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) &su_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); @@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); sui->ncleansegs--; mark_buffer_dirty(header_bh); @@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_header; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } @@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct nilfs_segment_usage *su; void *kaddr; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; @@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, void *kaddr; int clean, dirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } clean = nilfs_segment_usage_clean(su); @@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; @@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, void *kaddr; int sudirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } if (unlikely(nilfs_segment_usage_error(su))) @@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); @@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) if (ret) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, @@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (modtime) { /* @@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); @@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); @@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); out_sem: @@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, void *kaddr; int suclean; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_error(su)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, /* hole */ continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); su2 = su; @@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); goto out_header; } @@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, nc++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; @@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) sui->allocmin = 0; } - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); @@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); for (j = 0; j < n; @@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; @@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, goto out_header; for (;;) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, sup->sup_segnum, bh, kaddr); @@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); sup = (void *)sup + supsz; if (sup >= supend) @@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { @@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } if (nblocks >= minlen) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, @@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } ndiscarded += nblocks; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); } @@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) start = seg_start; nblocks = seg_end - seg_start + 1; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_bh(su_bh); } @@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, goto failed; sui = NILFS_SUI(sufile); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; From 3ce4276b207eda048014dabc2295bf056278fdb9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:53 +0900 Subject: [PATCH 594/707] nilfs2: convert persistent object allocator to use kmap_local Regarding the allocator code that is commonly used in the ondisk inode metadata file ifile and the disk address translation metadata file DAT, convert the parts that use the deprecated kmap_atomic() and kmap() to use kmap_local. Most can be converted directly, but only nilfs_palloc_prepare_alloc_entry() needs to be rewritten to change mapping sections so that multiple kmap_local/kunmap_local calls are nested and disk I/O can be avoided within the mapping sections. Link: https://lkml.kernel.org/r/20240122140202.6950-7-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/alloc.c | 91 ++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 7342de296ec3c6..89caef7513db35 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; - desc_kaddr = kmap(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); - for (j = 0; j < n; j++, desc++, group++) { + for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); - if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { - ret = nilfs_palloc_get_bitmap_block( - inode, group, 1, &bitmap_bh); - if (ret < 0) - goto out_desc; - bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); - pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, - entries_per_group, lock); - if (pos >= 0) { - /* found a free entry */ - nilfs_palloc_group_desc_add_entries( - desc, lock, -1); - req->pr_entry_nr = - entries_per_group * group + pos; - kunmap(desc_bh->b_page); - kunmap(bitmap_bh->b_page); - - req->pr_desc_bh = desc_bh; - req->pr_bitmap_bh = bitmap_bh; - return 0; - } - kunmap(bitmap_bh->b_page); - brelse(bitmap_bh); + if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) + continue; + + kunmap_local(desc_kaddr); + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, + &bitmap_bh); + if (unlikely(ret < 0)) { + brelse(desc_bh); + return ret; } - group_offset = 0; + desc_kaddr = kmap_local_page(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + bitmap, group_offset, entries_per_group, lock); + kunmap_local(bitmap_kaddr); + if (pos >= 0) + goto found; + + brelse(bitmap_bh); } - kunmap(desc_bh->b_page); + kunmap_local(desc_kaddr); brelse(desc_bh); } /* no entries left */ return -ENOSPC; - out_desc: - kunmap(desc_bh->b_page); - brelse(desc_bh); - return ret; +found: + /* found a free entry */ + nilfs_palloc_group_desc_add_entries(desc, lock, -1); + req->pr_entry_nr = entries_per_group * group + pos; + kunmap_local(desc_kaddr); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; } /** @@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + kunmap_local(desc_kaddr); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); @@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + kunmap_local(desc_kaddr); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); @@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) entry_start = rounddown(group_offset, epb); } while (true); - kunmap(bitmap_bh->b_page); + kunmap_local(bitmap_kaddr); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); @@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode->i_ino); } - desc_kaddr = kmap_atomic(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); - kunmap_atomic(desc_kaddr); + kunmap_local(desc_kaddr); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); From 676cc3b2e60e1b183026a3ecc0e92d3449d62946 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:54 +0900 Subject: [PATCH 595/707] nilfs2: convert DAT to use kmap_local Concerning the code of the metadata file DAT for disk address translation, convert all parts that use the deprecated kmap_atomic to use kmap_local. All transformations are directly possible. Link: https://lkml.kernel.org/r/20240122140202.6950-8-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/dat.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9cf6ba58f5859f..8f71f8b0e2188b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); @@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); @@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); } @@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) if (ret < 0) return ret; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); end = start = le64_to_cpu(entry->de_start); @@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) nilfs_dat_commit_free(dat, req); @@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); @@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, @@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); @@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { @@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) *blocknrp = blocknr; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return ret; } @@ -457,7 +457,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, 0, &entry_bh); if (ret < 0) return ret; - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); /* last virtual block number in this block */ first = vinfo->vi_vblocknr; do_div(first, entries_per_block); @@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); } From 5705f6e6e8b5c0a35324d4596e93624fc57df291 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:55 +0900 Subject: [PATCH 596/707] nilfs2: move nilfs_bmap_write call out of nilfs_write_inode_common Before converting the disk inode management metadata file ifile, the call to nilfs_bmap_write(), the i_device_code setting, and the zero-fill code for inodes on the super root block are moved from nilfs_write_inode_common() to its callers. This cleanup simplifies the role and arguments of nilfs_write_inode_common() and collects calls to nilfs_bmap_write() to the log writing code. Also, add and use a new helper nilfs_write_root_mdt_inode() to avoid code duplication in the data export routine nilfs_segctor_fill_in_super_root() to the super root block's buffer. Link: https://lkml.kernel.org/r/20240122140202.6950-9-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 38 +++++++++++++++--------------------- fs/nilfs2/nilfs.h | 3 ++- fs/nilfs2/segment.c | 47 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 9c334c722fc1c1..b9d40f5e94d32a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) return s_inode; } +/** + * nilfs_write_inode_common - export common inode information to on-disk inode + * @inode: inode object + * @raw_inode: on-disk inode + * + * This function writes standard information from the on-memory inode @inode + * to @raw_inode on ifile, cpfile or a super root block. Since inode bmap + * data is not exported, nilfs_bmap_write() must be called separately during + * log writing. + */ void nilfs_write_inode_common(struct inode *inode, - struct nilfs_inode *raw_inode, int has_bmap) + struct nilfs_inode *raw_inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { - struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - - /* zero-fill unused portion in the case of super root block */ - raw_inode->i_xattr = 0; - raw_inode->i_pad = 0; - memset((void *)raw_inode + sizeof(*raw_inode), 0, - nilfs->ns_inode_size - sizeof(*raw_inode)); - } - - if (has_bmap) - nilfs_bmap_write(ii->i_bmap, raw_inode); - else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - raw_inode->i_device_code = - cpu_to_le64(huge_encode_dev(inode->i_rdev)); /* * When extending inode, nilfs->ns_inode_size should be checked * for substitutions of appended fields. @@ -813,12 +808,11 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) if (flags & I_DIRTY_DATASYNC) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); - nilfs_write_inode_common(inode, raw_inode, 0); - /* - * XXX: call with has_bmap = 0 is a workaround to avoid - * deadlock of bmap. This delays update of i_bmap to just - * before writing. - */ + nilfs_write_inode_common(inode, raw_inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); nilfs_ifile_unmap_inode(ifile, ino, ibh); } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 98cffaf0ac1277..2e29b98ba8bab2 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); -extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode); struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino); struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2590a0860eab02..b512e772846568 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -913,6 +913,7 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; struct buffer_head *bh_cp; struct nilfs_checkpoint *raw_cp; + struct inode *ifile; int err; err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, @@ -941,8 +942,10 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) else nilfs_checkpoint_set_minor(raw_cp); - nilfs_write_inode_common(sci->sc_root->ifile, - &raw_cp->cp_ifile_inode, 1); + ifile = sci->sc_root->ifile; + nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode); + nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); return 0; @@ -977,6 +980,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) } } +/** + * nilfs_write_root_mdt_inode - export root metadata inode information to + * the on-disk inode + * @inode: inode object of the root metadata file + * @raw_inode: on-disk inode + * + * nilfs_write_root_mdt_inode() writes inode information and bmap data of + * @inode to the inode area of the metadata file allocated on the super root + * block created to finalize the log. Since super root blocks are configured + * each time, this function zero-fills the unused area of @raw_inode. + */ +static void nilfs_write_root_mdt_inode(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + nilfs_write_inode_common(inode, raw_inode); + + /* zero-fill unused portion of raw_inode */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + + nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode); +} + static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { @@ -998,12 +1028,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + - NILFS_SR_DAT_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + - NILFS_SR_CPFILE_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + - NILFS_SR_SUFILE_OFFSET(isz), 1); + nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz)); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); set_buffer_uptodate(bh_sr); unlock_buffer(bh_sr); From f422ef6111d54c192bcc7e0c2d381a5a90299583 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:56 +0900 Subject: [PATCH 597/707] nilfs2: do not acquire rwsem in nilfs_bmap_write() It is now clear that nilfs_bmap_write() is only used to finalize logs written to disk. Concurrent bmap modification operations are not performed on bmaps in this context. Additionally, this function does not modify data used in read-only operations such as bmap lookups. Therefore, there is no need to acquire bmap->b_sem in nilfs_bmap_write(), so delete it. Link: https://lkml.kernel.org/r/20240122140202.6950-10-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/bmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 7a8f166f2c8d84..383f0afa2cea36 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) */ void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { - down_write(&bmap->b_sem); memcpy(raw_inode->i_bmap, bmap->b_u.u_data, NILFS_INODE_BMAP_SIZE * sizeof(__le64)); if (bmap->b_inode->i_ino == NILFS_DAT_INO) bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; - - up_write(&bmap->b_sem); } void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) From 2d97cae8cc4853f622ca6a05ac7d63d455bd58f1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:57 +0900 Subject: [PATCH 598/707] nilfs2: convert ifile to use kmap_local Convert deprecated kmap() and kmap_atomic() to use kmap_local for the ifile metadata file used to manage disk inodes. In some usages, calls to kmap_local and kunmap_local are split into different helpers, but those usages can be safely changed to local thread kmap. Link: https://lkml.kernel.org/r/20240122140202.6950-11-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/ifile.c | 4 ++-- fs/nilfs2/ifile.h | 7 +++---- fs/nilfs2/inode.c | 6 +++--- fs/nilfs2/segment.c | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index a8a4bc8490b4d8..e9538fa46ff27d 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -115,11 +115,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) return ret; } - kaddr = kmap_atomic(req.pr_entry_bh->b_page); + kaddr = kmap_local_page(req.pr_entry_bh->b_page); raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, req.pr_entry_bh, kaddr); raw_inode->i_flags = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 35c5273f48219b..b71ab0a81dc45e 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -21,15 +21,14 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { - void *kaddr = kmap(ibh->b_page); + void *kaddr = kmap_local_page(ibh->b_page); return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); } -static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, - struct buffer_head *ibh) +static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode) { - kunmap(ibh->b_page); + kunmap_local(raw_inode); } int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b9d40f5e94d32a..a475095a5e80b7 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode, inode->i_mode, huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); @@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb, return 0; failed_unmap: - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); bad_inode: @@ -814,7 +814,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); - nilfs_ifile_unmap_inode(ifile, ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index b512e772846568..3657918328ea6f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -966,7 +966,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile, raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, ibh); nilfs_bmap_write(ii->i_bmap, raw_inode); - nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } } From 5c8436d4b3c119a00a98926106f894638f8c3cbb Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:58 +0900 Subject: [PATCH 599/707] nilfs2: localize highmem mapping for checkpoint creation within cpfile In order to convert kmap() used in cpfile to kmap_local, first move the checkpoint creation routine, which is one of the places where kmap is used, to the cpfile side and make the page mapping local and temporary. And use kmap_local instead of kmap to access the checkpoint entry page (and header block page) when generating a checkpoint. Link: https://lkml.kernel.org/r/20240122140202.6950-12-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 1 + fs/nilfs2/segment.c | 31 ++----------------- 3 files changed, 77 insertions(+), 29 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 39136637f7155b..f62da80e530a73 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -272,6 +272,80 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, return ret; } +/** + * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint to set up + * + * This function creates a checkpoint with the number specified by @cno on + * cpfile. If the specified checkpoint entry already exists due to a past + * failure, it will be reused without returning an error. + * In either case, the buffer of the block containing the checkpoint entry + * and the cpfile inode are made dirty for inclusion in the write log. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + * * %-EROFS - Read only filesystem + */ +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (WARN_ON_ONCE(cno < 1)) + return -EIO; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) { + nilfs_error(cpfile->i_sb, + "checkpoint creation failed due to metadata corruption."); + ret = -EIO; + } + goto out_sem; + } + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh); + if (unlikely(ret < 0)) + goto out_header; + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + kunmap_local(kaddr); + + kaddr = kmap_local_page(header_bh->b_page); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_local(kaddr); + mark_buffer_dirty(header_bh); + } else { + kunmap_local(kaddr); + } + + /* Force the buffer and the inode to become dirty */ + mark_buffer_dirty(cp_bh); + brelse(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + +out_header: + brelse(header_bh); + +out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + /** * nilfs_cpfile_put_checkpoint - put a checkpoint * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index edabb2dc57567c..fcb1a94097b3f9 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -19,6 +19,7 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, struct buffer_head **); +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3657918328ea6f..37d06eacec6394 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -880,34 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(nilfs->ns_dat); } -static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - int err; - - /* XXX: this interface will be changed */ - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, - &raw_cp, &bh_cp); - if (likely(!err)) { - /* - * The following code is duplicated with cpfile. But, it is - * needed to collect the checkpoint even if it was not newly - * created. - */ - mark_buffer_dirty(bh_cp); - nilfs_mdt_mark_dirty(nilfs->ns_cpfile); - nilfs_cpfile_put_checkpoint( - nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint creation failed due to metadata corruption."); - err = -EIO; - } - return err; -} - static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; @@ -1261,7 +1233,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) break; nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ - err = nilfs_segctor_create_checkpoint(sci); + err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, + nilfs->ns_cno); if (unlikely(err)) break; fallthrough; From f477587f6cedd74ebbd92f25c28a551893014cdd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:01:59 +0900 Subject: [PATCH 600/707] nilfs2: localize highmem mapping for checkpoint finalization within cpfile Move the checkpoint finalization routine to the cpfile side, and make the page mapping local and temporary. And use kmap_local instead of kmap to access the checkpoint entry page when finalizing a checkpoint. In this conversion, some of the information on the checkpoint entry being rewritten is passed through the arguments of the newly added method nilfs_cpfile_finalize_checkpoint(). Link: https://lkml.kernel.org/r/20240122140202.6950-13-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 3 ++ fs/nilfs2/segment.c | 51 +++---------------------------- 3 files changed, 82 insertions(+), 46 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index f62da80e530a73..3af77252e08141 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -363,6 +363,80 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, brelse(bh); } +/** + * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @root: nilfs root object + * @blkinc: number of blocks added by this checkpoint + * @ctime: checkpoint creation time + * @minor: minor checkpoint flag + * + * This function completes the checkpoint entry numbered by @cno in the + * cpfile with the data given by the arguments @root, @blkinc, @ctime, and + * @minor. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + */ +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor) +{ + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (WARN_ON_ONCE(cno < 1)) + return -EIO; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + goto error; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (unlikely(nilfs_checkpoint_invalid(cp))) { + kunmap_local(kaddr); + brelse(cp_bh); + goto error; + } + + cp->cp_snapshot_list.ssl_next = 0; + cp->cp_snapshot_list.ssl_prev = 0; + cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count)); + cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count)); + cp->cp_nblk_inc = cpu_to_le64(blkinc); + cp->cp_create = cpu_to_le64(ctime); + cp->cp_cno = cpu_to_le64(cno); + + if (minor) + nilfs_checkpoint_set_minor(cp); + else + nilfs_checkpoint_clear_minor(cp); + + nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); + nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); + + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; + +error: + nilfs_error(cpfile->i_sb, + "checkpoint finalization failed due to metadata corruption."); + ret = -EIO; + goto out_sem; +} + /** * nilfs_cpfile_delete_checkpoints - delete checkpoints * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index fcb1a94097b3f9..aa1408a3af010e 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,6 +21,9 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct buffer_head **); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor); int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 37d06eacec6394..ecf778f44bbc85 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -880,51 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(nilfs->ns_dat); } -static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - struct inode *ifile; - int err; - - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, - &raw_cp, &bh_cp); - if (unlikely(err)) { - if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint finalization failed due to metadata corruption."); - err = -EIO; - } - goto failed_ibh; - } - raw_cp->cp_snapshot_list.ssl_next = 0; - raw_cp->cp_snapshot_list.ssl_prev = 0; - raw_cp->cp_inodes_count = - cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); - raw_cp->cp_blocks_count = - cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); - raw_cp->cp_nblk_inc = - cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); - raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); - raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); - - if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) - nilfs_checkpoint_clear_minor(raw_cp); - else - nilfs_checkpoint_set_minor(raw_cp); - - ifile = sci->sc_root->ifile; - nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode); - nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode); - - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - return 0; - - failed_ibh: - return err; -} - static void nilfs_fill_in_file_bmap(struct inode *ifile, struct nilfs_inode_info *ii) @@ -2103,7 +2058,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { - err = nilfs_segctor_fill_in_checkpoint(sci); + err = nilfs_cpfile_finalize_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root, + sci->sc_nblk_inc + sci->sc_nblk_this_inc, + sci->sc_seg_ctime, + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)); if (unlikely(err)) goto failed_to_write; From 5fb49887991cdfb5d8284c97cc27a4ac9d8722da Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:00 +0900 Subject: [PATCH 601/707] nilfs2: localize highmem mapping for checkpoint reading within cpfile Move the code for reading from a checkpoint entry that is performed in nilfs_attach_checkpoint() to the cpfile side, and make the page mapping local and temporary. And use kmap_local instead of kmap to access the checkpoint entry page. In order to load the ifile inode information included in the checkpoint entry within the inode lock section of nilfs_ifile_read(), the newly added checkpoint reading method nilfs_cpfile_read_checkpoint() is called indirectly via nilfs_ifile_read() instead of from nilfs_attach_checkpoint(). Link: https://lkml.kernel.org/r/20240122140202.6950-14-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/cpfile.h | 2 ++ fs/nilfs2/ifile.c | 17 ++++++++---- fs/nilfs2/ifile.h | 3 +- fs/nilfs2/super.c | 31 ++++----------------- 5 files changed, 87 insertions(+), 34 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 3af77252e08141..56e38843536b96 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -186,6 +186,74 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, nilfs_cpfile_get_blkoff(cpfile, cno)); } +/** + * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint entry to read + * @root: nilfs root object + * @ifile: ifile's inode to read and attach to @root + * + * This function imports checkpoint information from the checkpoint file and + * stores it to the inode file given by @ifile and the nilfs root object + * given by @root. + * + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + */ +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile) +{ + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) + return -EINVAL; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = -EINVAL; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -EINVAL; + goto put_cp; + } + + ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode); + if (unlikely(ret)) { + /* + * Since this inode is on a checkpoint entry, treat errors + * as metadata corruption. + */ + nilfs_err(cpfile->i_sb, + "ifile inode (checkpoint number=%llu) corrupted", + (unsigned long long)cno); + ret = -EIO; + goto put_cp; + } + + /* Configure the nilfs root object */ + atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count)); + root->ifile = ifile; + +put_cp: + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + /** * nilfs_cpfile_get_checkpoint - get a checkpoint * @cpfile: inode of checkpoint file diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index aa1408a3af010e..2cfa14011bc832 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -19,6 +19,8 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, struct buffer_head **); +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index e9538fa46ff27d..612e609158b520 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -15,6 +15,7 @@ #include "mdt.h" #include "alloc.h" #include "ifile.h" +#include "cpfile.h" /** * struct nilfs_ifile_info - on-memory private data of ifile @@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object + * @cno: number of checkpoint entry to read * @inode_size: size of an inode - * @raw_inode: on-disk ifile inode - * @inodep: buffer to store the inode + * + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). */ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep) + __u64 cno, size_t inode_size) { + struct the_nilfs *nilfs; struct inode *ifile; int err; @@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); - err = nilfs_read_inode_common(ifile, raw_inode); + nilfs = sb->s_fs_info; + err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile); if (err) goto failed; unlock_new_inode(ifile); out: - *inodep = ifile; return 0; failed: iget_failed(ifile); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index b71ab0a81dc45e..625545cc2a989f 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -38,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep); + __u64 cno, size_t inode_size); #endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index df8674173b2202..5e630c179a1e29 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; - struct nilfs_checkpoint *raw_cp; - struct buffer_head *bh_cp; int err = -ENOMEM; root = nilfs_find_or_create_root( @@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, goto reuse; /* already attached checkpoint */ down_read(&nilfs->ns_segctor_sem); - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, - &bh_cp); + err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size); up_read(&nilfs->ns_segctor_sem); - if (unlikely(err)) { - if (err == -ENOENT || err == -EINVAL) { - nilfs_err(sb, - "Invalid checkpoint (checkpoint number=%llu)", - (unsigned long long)cno); - err = -EINVAL; - } + if (unlikely(err)) goto failed; - } - - err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, - &raw_cp->cp_ifile_inode, &root->ifile); - if (err) - goto failed_bh; - - atomic64_set(&root->inodes_count, - le64_to_cpu(raw_cp->cp_inodes_count)); - atomic64_set(&root->blocks_count, - le64_to_cpu(raw_cp->cp_blocks_count)); - - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); reuse: *rootp = root; return 0; - failed_bh: - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); failed: + if (err == -EINVAL) + nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)", + (unsigned long long)cno); nilfs_put_root(root); return err; From 71488bdc373d2884b4323c7427ebc316b4eaab6a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:01 +0900 Subject: [PATCH 602/707] nilfs2: remove nilfs_cpfile_{get,put}_checkpoint() All calls to nilfs_cpfile_get_checkpoint() and nilfs_cpfile_put_checkpoint() that call kmap() and kunmap() separately are now gone, so remove these methods. Link: https://lkml.kernel.org/r/20240122140202.6950-15-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 103 --------------------------------------------- fs/nilfs2/cpfile.h | 4 -- 2 files changed, 107 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 56e38843536b96..b5bad332d630c5 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -254,92 +254,6 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, return ret; } -/** - * nilfs_cpfile_get_checkpoint - get a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @create: create flag - * @cpp: pointer to a checkpoint - * @bhp: pointer to a buffer head - * - * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint - * specified by @cno. A new checkpoint will be created if @cno is the current - * checkpoint number and @create is nonzero. - * - * Return Value: On success, 0 is returned, and the checkpoint and the - * buffer head of the buffer on which the checkpoint is located are stored in - * the place pointed by @cpp and @bhp, respectively. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - No such checkpoint. - * - * %-EINVAL - invalid checkpoint. - */ -int nilfs_cpfile_get_checkpoint(struct inode *cpfile, - __u64 cno, - int create, - struct nilfs_checkpoint **cpp, - struct buffer_head **bhp) -{ - struct buffer_head *header_bh, *cp_bh; - struct nilfs_cpfile_header *header; - struct nilfs_checkpoint *cp; - void *kaddr; - int ret; - - if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || - (cno < nilfs_mdt_cno(cpfile) && create))) - return -EINVAL; - - down_write(&NILFS_MDT(cpfile)->mi_sem); - - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_sem; - ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); - if (ret < 0) - goto out_header; - kaddr = kmap(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); - if (nilfs_checkpoint_invalid(cp)) { - if (!create) { - kunmap(cp_bh->b_page); - brelse(cp_bh); - ret = -ENOENT; - goto out_header; - } - /* a newly-created checkpoint */ - nilfs_checkpoint_clear_invalid(cp); - if (!nilfs_cpfile_is_in_first(cpfile, cno)) - nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, - kaddr, 1); - mark_buffer_dirty(cp_bh); - - kaddr = kmap_atomic(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); - le64_add_cpu(&header->ch_ncheckpoints, 1); - kunmap_atomic(kaddr); - mark_buffer_dirty(header_bh); - nilfs_mdt_mark_dirty(cpfile); - } - - if (cpp != NULL) - *cpp = cp; - *bhp = cp_bh; - - out_header: - brelse(header_bh); - - out_sem: - up_write(&NILFS_MDT(cpfile)->mi_sem); - return ret; -} - /** * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile * @cpfile: checkpoint file inode @@ -414,23 +328,6 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) return ret; } -/** - * nilfs_cpfile_put_checkpoint - put a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @bh: buffer head - * - * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint - * specified by @cno. @bh must be the buffer head which has been returned by - * a previous call to nilfs_cpfile_get_checkpoint() with @cno. - */ -void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, - struct buffer_head *bh) -{ - kunmap(bh->b_page); - brelse(bh); -} - /** * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile * @cpfile: checkpoint file inode diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 2cfa14011bc832..f5b1d59289ebf8 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -16,13 +16,9 @@ #include /* nilfs_inode, nilfs_checkpoint */ -int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, - struct nilfs_checkpoint **, - struct buffer_head **); int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, struct inode *ifile); int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); -void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, struct nilfs_root *root, __u64 blkinc, time64_t ctime, bool minor); From ab52d122692776d5e0a29997f7d9e2d0b6d9f509 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 22 Jan 2024 23:02:02 +0900 Subject: [PATCH 603/707] nilfs2: convert cpfile to use kmap_local Convert all remaining usages of kmap_atomic in cpfile to kmap_local. Link: https://lkml.kernel.org/r/20240122140202.6950-16-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/cpfile.c | 90 +++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b5bad332d630c5..2c57132584de74 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -460,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; } - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint( cpfile, cno, cp_bh, kaddr); nicps = 0; @@ -482,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cp_bh, kaddr, nicps); if (count == 0) { /* make hole */ - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); ret = nilfs_cpfile_delete_checkpoint_block( @@ -497,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); } if (tnicps > 0) { - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } brelse(header_bh); @@ -560,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { @@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, n++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); } @@ -604,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (curr == 0) { ret = 0; @@ -625,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = 0; /* No snapshots (started from a hole block) */ goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); while (n < nci) { cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ @@ -641,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); @@ -649,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, WARN_ON(ret == -ENOENT); goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); } curr = next; curr_blkoff = next_blkoff; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); *cnop = curr; ret = n; @@ -763,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_cp; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); list = &header->ch_snapshot_list; curr_bh = header_bh; @@ -792,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; if (curr_blkoff != prev_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); } curr_blkoff = prev_blkoff; cp = nilfs_cpfile_block_get_checkpoint( @@ -806,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, @@ -818,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, curr, curr_bh, kaddr); list->ssl_prev = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, 1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); @@ -881,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) @@ -921,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(next_bh->b_page); + kaddr = kmap_local_page(next_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, next, next_bh, kaddr); list->ssl_prev = cpu_to_le64(prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, -1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); @@ -1002,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out: @@ -1085,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out_sem: From 834ec1d23052b3cc69ed409793e4d370fcda0ebb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:07 -0700 Subject: [PATCH 604/707] kbuild: raise the minimum supported version of LLVM to 13.0.1 Patch series "Bump the minimum supported version of LLVM to 13.0.1". This series bumps the minimum supported version of LLVM for building the kernel to 13.0.1. The first patch does the bump and all subsequent patches clean up all the various workarounds and checks for earlier versions. Quoting the first patch's commit message for those that were only on CC for the clean ups: When __builtin_mul_overflow() has arguments that differ in terms of signedness and width, LLVM may generate a libcall to __muloti4 because it performs the checks in terms of 65-bit multiplication. This issue becomes harder to hit (but still possible) after LLVM 12.0.0, which includes a special case for matching widths but different signs. To gain access to this special case, which the kernel can take advantage of when calls to __muloti4 appear, bump the minimum supported version of LLVM for building the kernel to 13.0.1. 13.0.1 was chosen because there is minimal impact to distribution support while allowing a few more workarounds to be dropped in the kernel source than if 12.0.0 were chosen. Looking at container images of up to date distribution versions: archlinux:latest clang version 16.0.6 debian:oldoldstable-slim clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final) debian:oldstable-slim Debian clang version 11.0.1-2 debian:stable-slim Debian clang version 14.0.6 debian:testing-slim Debian clang version 16.0.6 (19) debian:unstable-slim Debian clang version 16.0.6 (19) fedora:38 clang version 16.0.6 (Fedora 16.0.6-3.fc38) fedora:latest clang version 17.0.6 (Fedora 17.0.6-1.fc39) fedora:rawhide clang version 17.0.6 (Fedora 17.0.6-1.fc40) opensuse/leap:latest clang version 15.0.7 opensuse/tumbleweed:latest clang version 17.0.6 ubuntu:focal clang version 10.0.0-4ubuntu1 ubuntu:latest Ubuntu clang version 14.0.0-1ubuntu1.1 ubuntu:rolling Ubuntu clang version 16.0.6 (15) ubuntu:devel Ubuntu clang version 17.0.6 (3) The only distribution that gets left behind is Debian Bullseye, as the default version is 11.0.1; other distributions either have a newer version than 13.0.1 or one older than the current minimum of 11.0.0. Debian has easy access to more recent LLVM versions through apt.llvm.org, so this is not as much of a concern. There are also the kernel.org LLVM toolchains, which should work with distributions with glibc 2.28 and newer. Another benefit of slimming up the number of supported versions of LLVM for building the kernel is reducing the build capacity needed to support a matrix that builds with each supported version, which allows a matrix to reallocate the freed up build capacity towards something else, such as more configuration combinations. This passes my build matrix with all supported versions. This is based on Andrew's mm-nonmm-unstable to avoid trivial conflicts with my series to update the LLVM links across the repository [1] but I can easily rebase it to linux-kbuild if Masahiro would rather these patches go through there (and defer the conflict resolution to the merge window). [1]: https://lore.kernel.org/20240109-update-llvm-links-v1-0-eb09b59db071@kernel.org/ This patch (of 11): When __builtin_mul_overflow() has arguments that differ in terms of signedness and width, LLVM may generate a libcall to __muloti4 because it performs the checks in terms of 65-bit multiplication. This issue becomes harder to hit (but still possible) after LLVM 12.0.0, which includes a special case for matching widths but different signs. To gain access to this special case, which the kernel can take advantage of when calls to __muloti4 appear, bump the minimum supported version of LLVM for building the kernel to 13.0.1. 13.0.1 was chosen because there is minimal impact to distribution support while allowing a few more workarounds to be dropped in the kernel source than if 12.0.0 were chosen. Looking at container images of up to date distribution versions: archlinux:latest clang version 16.0.6 debian:oldoldstable-slim clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final) debian:oldstable-slim Debian clang version 11.0.1-2 debian:stable-slim Debian clang version 14.0.6 debian:testing-slim Debian clang version 16.0.6 (19) debian:unstable-slim Debian clang version 16.0.6 (19) fedora:38 clang version 16.0.6 (Fedora 16.0.6-3.fc38) fedora:latest clang version 17.0.6 (Fedora 17.0.6-1.fc39) fedora:rawhide clang version 17.0.6 (Fedora 17.0.6-1.fc40) opensuse/leap:latest clang version 15.0.7 opensuse/tumbleweed:latest clang version 17.0.6 ubuntu:focal clang version 10.0.0-4ubuntu1 ubuntu:latest Ubuntu clang version 14.0.0-1ubuntu1.1 ubuntu:rolling Ubuntu clang version 16.0.6 (15) ubuntu:devel Ubuntu clang version 17.0.6 (3) The only distribution that gets left behind is Debian Bullseye, as the default version is 11.0.1; other distributions either have a newer version than 13.0.1 or one older than the current minimum of 11.0.0. Debian has easy access to more recent LLVM versions through apt.llvm.org, so this is not as much of a concern. There are also the kernel.org LLVM toolchains, which should work with distributions with glibc 2.28 and newer. Another benefit of slimming up the number of supported versions of LLVM for building the kernel is reducing the build capacity needed to support a matrix that builds with each supported version, which allows a matrix to reallocate the freed up build capacity towards something else, such as more configuration combinations. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-0-f5ff9bda41c5@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1975 Link: https://github.com/llvm/llvm-project/issues/38013 Link: https://github.com/llvm/llvm-project/commit/3203143f1356a4e4e3ada231156fc6da6e1a9f9d Link: https://mirrors.edge.kernel.org/pub/tools/llvm/ Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-1-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nathan Chancellor Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/process/changes.rst | 2 +- scripts/min-tool-version.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 50b3d1cb11159b..d7306b8cad1378 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -30,7 +30,7 @@ you probably needn't concern yourself with pcmciautils. Program Minimal version Command to check the version ====================== =============== ======================================== GNU C 5.1 gcc --version -Clang/LLVM (optional) 11.0.0 clang --version +Clang/LLVM (optional) 13.0.1 clang --version Rust (optional) 1.74.1 rustc --version bindgen (optional) 0.65.1 bindgen --version GNU make 3.82 make --version diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 9faa4d3d91e358..5d17022ee1f6f8 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -29,7 +29,7 @@ llvm) elif [ "$SRCARCH" = loongarch ]; then echo 18.0.0 else - echo 11.0.0 + echo 13.0.1 fi ;; rustc) From 8f20632b4f573e5063fec43000119ed47ac76d81 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:08 -0700 Subject: [PATCH 605/707] Makefile: drop warn-stack-size plugin opt Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the inner ifeq statement is always false, as the build will fail during the configuration stage for older LLVM versions. This effectively reverts commit 24845dcb170e ("Makefile: LTO: have linker check -Wframe-larger-than") and its follow up fix, commit 0236526d76b8 ("Makefile: lto: Pass -warn-stack-size only on LLD < 13.0.0"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-2-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- Makefile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Makefile b/Makefile index 9869f57c3fb3e6..885b2940e20da2 100644 --- a/Makefile +++ b/Makefile @@ -951,14 +951,6 @@ CC_FLAGS_LTO += -fvisibility=hidden # Limit inlining across translation units to reduce binary size KBUILD_LDFLAGS += -mllvm -import-instr-limit=5 - -# Check for frame size exceeding threshold during prolog/epilog insertion -# when using lld < 13.0.0. -ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y) -KBUILD_LDFLAGS += -plugin-opt=-warn-stack-size=$(CONFIG_FRAME_WARN) -endif -endif endif ifdef CONFIG_LTO From e3dee7f334341775b6641e70efd382d1359f43af Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:09 -0700 Subject: [PATCH 606/707] x86: drop stack-alignment plugin opt Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the inner ifeq statement is always false, as the build will fail during the configuration stage for older LLVM versions. This effectively reverts part of commit b33fff07e3e3 ("x86, build: allow LTO to be selected") and its follow up fix, commit 2398ce80152a ("x86, lto: Pass -stack-alignment only on LLD < 13.0.0"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-3-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1a068de12a564f..de30a8b35c41cf 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -217,12 +217,6 @@ endif KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) -ifdef CONFIG_LTO_CLANG -ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y) -KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) -endif -endif - ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs --discard-none else From a18f79c4145fbb59c90d8c9eca540cb514d43392 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:10 -0700 Subject: [PATCH 607/707] ARM: remove Thumb2 __builtin_thread_pointer workaround for Clang Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the conditional expression added to get_current() by commit c1e42efacb9b ("ARM: 9151/1: Thumb2: avoid __builtin_thread_pointer() on Clang") is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it, effectively reverting the aforementioned change. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-4-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/current.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/arm/include/asm/current.h b/arch/arm/include/asm/current.h index 1e1178bf176da6..5225cb1c803b16 100644 --- a/arch/arm/include/asm/current.h +++ b/arch/arm/include/asm/current.h @@ -18,18 +18,12 @@ static __always_inline __attribute_const__ struct task_struct *get_current(void) { struct task_struct *cur; -#if __has_builtin(__builtin_thread_pointer) && \ - defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) && \ - !(defined(CONFIG_THUMB2_KERNEL) && \ - defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 130001) +#if __has_builtin(__builtin_thread_pointer) && defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) /* * Use the __builtin helper when available - this results in better * code, especially when using GCC in combination with the per-task * stack protector, as the compiler will recognize that it needs to * load the TLS register only once in every function. - * - * Clang < 13.0.1 gets this wrong for Thumb2 builds: - * https://github.com/ClangBuiltLinux/linux/issues/1485 */ cur = __builtin_thread_pointer(); #elif defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) || defined(CONFIG_SMP) From 1633a42d2cf19ebe8b35d076ea72f6069d53e495 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:11 -0700 Subject: [PATCH 608/707] arm64: Kconfig: clean up tautological LLVM version checks Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, several conditions become tautologies, as they will always be true because the build will fail during the configuration stage for older LLVM versions. Drop them, as they are unnecessary. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-5-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5a8acca4dbf495..cb34e7d780c090 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -383,7 +383,7 @@ config BUILTIN_RETURN_ADDRESS_STRIPS_PAC bool # Clang's __builtin_return_adddress() strips the PAC since 12.0.0 # https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2 - default y if CC_IS_CLANG && (CLANG_VERSION >= 120000) + default y if CC_IS_CLANG # GCC's __builtin_return_address() strips the PAC since 11.1.0, # and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94891 @@ -1387,7 +1387,6 @@ choice config CPU_BIG_ENDIAN bool "Build big-endian kernel" - depends on !LD_IS_LLD || LLD_VERSION >= 130000 # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c depends on AS_IS_GNU || AS_VERSION >= 150000 help @@ -2018,8 +2017,6 @@ config ARM64_BTI_KERNEL depends on !CC_IS_GCC || GCC_VERSION >= 100100 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106671 depends on !CC_IS_GCC - # https://github.com/llvm/llvm-project/commit/a88c722e687e6780dcd6a58718350dc76fcc4cc9 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120000 depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_ARGS) help Build the kernel with Branch Target Identification annotations From c9be957f1135c039151f2483cfa5924f3cad53ff Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:12 -0700 Subject: [PATCH 609/707] powerpc: Kconfig: remove tautology in CONFIG_COMPAT This reverts commit 6fcb574125e6 ("powerpc: Kconfig: disable CONFIG_COMPAT for clang < 12"). Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-6-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d281..86da0d01365a75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -333,7 +333,6 @@ config PANIC_TIMEOUT config COMPAT bool "Enable support for 32bit binaries" depends on PPC64 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120000 default y if !CPU_LITTLE_ENDIAN select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION From 944e827678dd77e8dbc89d9fe7111aec830757f3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:13 -0700 Subject: [PATCH 610/707] riscv: remove MCOUNT_NAME workaround Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, the condition for using _mcount as MCOUNT_NAME is always true, as the build will fail during the configuration stage for older LLVM versions. Replace MCOUNT_NAME with _mcount directly. This effectively reverts commit 7ce047715030 ("riscv: Workaround mcount name prior to clang-13"). Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-7-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/ftrace.h | 14 ++------------ arch/riscv/kernel/mcount.S | 10 +++++----- scripts/recordmcount.pl | 2 +- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 06874fb1311e5e..cf5b63e789fa7c 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,19 +13,9 @@ #endif #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR -/* - * Clang prior to 13 had "mcount" instead of "_mcount": - * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44 - */ -#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 -#define MCOUNT_NAME _mcount -#else -#define MCOUNT_NAME mcount -#endif - #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ -void MCOUNT_NAME(void); +void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; @@ -75,7 +65,7 @@ struct dyn_arch_ftrace { * both auipc and jalr at the same time. */ -#define MCOUNT_ADDR ((unsigned long)MCOUNT_NAME) +#define MCOUNT_ADDR ((unsigned long)_mcount) #define JALR_SIGN_MASK (0x00000800) #define JALR_OFFSET_MASK (0x00000fff) #define AUIPC_OFFSET_MASK (0xfffff000) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index d7ec69ac6910c6..3a42f6287909d0 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -50,8 +50,8 @@ SYM_TYPED_FUNC_START(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE - .global MCOUNT_NAME - .set MCOUNT_NAME, ftrace_stub + .global _mcount + .set _mcount, ftrace_stub #endif ret SYM_FUNC_END(ftrace_stub) @@ -80,7 +80,7 @@ SYM_FUNC_END(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -SYM_FUNC_START(MCOUNT_NAME) +SYM_FUNC_START(_mcount) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return @@ -126,6 +126,6 @@ SYM_FUNC_START(MCOUNT_NAME) jalr t5 RESTORE_ABI_STATE ret -SYM_FUNC_END(MCOUNT_NAME) +SYM_FUNC_END(_mcount) #endif -EXPORT_SYMBOL(MCOUNT_NAME) +EXPORT_SYMBOL(_mcount) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index f84df9e383fd0a..0871b2e92584b2 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -352,7 +352,7 @@ $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; } elsif ($arch eq "riscv") { $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_mcount\$"; $type = ".quad"; $alignment = 2; } elsif ($arch eq "csky") { From e129633ac04f66c67b6c4edaa035bd0c811228bf Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:14 -0700 Subject: [PATCH 611/707] riscv: Kconfig: remove version dependency from CONFIG_CLANG_SUPPORTS_DYNAMIC_FTRACE Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-8-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 69d24f51392206..00edc4ff589c99 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -174,8 +174,6 @@ config RISCV config CLANG_SUPPORTS_DYNAMIC_FTRACE def_bool CC_IS_CLANG - # https://github.com/llvm/llvm-project/commit/6ab8927931851bb42b2c93a00801dc499d7d9b1e - depends on CLANG_VERSION >= 130000 # https://github.com/ClangBuiltLinux/linux/issues/1817 depends on AS_IS_GNU || (AS_IS_LLVM && (LD_IS_LLD || LD_VERSION >= 23600)) From 2bc5ecf4daa5d64c5246c4e1176d0814b7a07e1f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:15 -0700 Subject: [PATCH 612/707] fortify: drop Clang version check for 12.0.1 or newer Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition is always true, as the build will fail during the configuration stage for older LLVM versions. Remove it. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-9-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- security/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/Kconfig b/security/Kconfig index 606a87c29a0170..412e76f1575d0d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -142,8 +142,6 @@ config HARDENED_USERCOPY config FORTIFY_SOURCE bool "Harden common str/mem functions against buffer overflows" depends on ARCH_HAS_FORTIFY_SOURCE - # https://llvm.org/pr41459 - depends on !CC_IS_CLANG || CLANG_VERSION >= 120001 # https://github.com/llvm/llvm-project/issues/53645 depends on !CC_IS_CLANG || !X86_32 help From 1717502a14afb53ac56294661a5426299c94107e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:16 -0700 Subject: [PATCH 613/707] lib/Kconfig.debug: update Clang version check in CONFIG_KCOV Now that the minimum supported version of LLVM for building the kernel has been bumped to 13.0.1, this condition can be changed to just CONFIG_CC_IS_CLANG, as the build will fail during the configuration stage for older LLVM versions. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-10-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8f502f15dc7fb5..1339fb893d712f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2085,7 +2085,7 @@ config KCOV depends on ARCH_HAS_KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \ - GCC_VERSION >= 120000 || CLANG_VERSION >= 130000 + GCC_VERSION >= 120000 || CC_IS_CLANG select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC select OBJTOOL if HAVE_NOINSTR_HACK From 94b190dbda3d16850a002c372fc1040541ce124e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:17 -0700 Subject: [PATCH 614/707] compiler-clang.h: update __diag_clang() macros for minimum version bump The minimum supported version of LLVM for building the kernel has been bumped to 13.0.1. Update the __diag_clang() macros for this bump. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-11-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index f0a47afef12581..49feac0162a526 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -114,11 +114,7 @@ #define __diag_str(s) __diag_str1(s) #define __diag(s) _Pragma(__diag_str(clang diagnostic s)) -#if CONFIG_CLANG_VERSION >= 110000 -#define __diag_clang_11(s) __diag(s) -#else -#define __diag_clang_11(s) -#endif +#define __diag_clang_13(s) __diag(s) #define __diag_ignore_all(option, comment) \ - __diag_clang(11, ignore, option) + __diag_clang(13, ignore, option) From aa6a58c8fcaf82e75ab200958605139e6ff93299 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:04 +0500 Subject: [PATCH 615/707] selftests/mm: hugetlb_reparenting_test: do not unmount Patch series "selftests/mm: Improve run_vmtests.sh", v3. In this series, I'm trying to add 3 missing tests to vm_runtests.sh which is used to run all the tests in mm suite. These tests weren't running by CIs. While enabling them and through review feedback, I've fixed some problems in tests as well. I've found more flakiness in more tests which I'll be fixing with future patches. hugetlb-read-hwpoison test is being added where it can only run with newly added "-d" (destructive) flag only. Not sure why it is failing again. So once it become stable, we can think of moving it to default set of tests if it doesn't have any side-effect to them. This patch (of 5): Do not unmount the cgroup if it wasn't mounted by the test. The earlier patch had fixed this for charge_reserved_hugetlb, but not for this test. I'm adding fixes tag to that earlier patch. Link: https://lkml.kernel.org/r/20240125154608.720072-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20240125154608.720072-2-usama.anjum@collabora.com Fixes: 209376ed2a84 ("selftests/vm: make charge_reserved_hugetlb.sh work with existing cgroup setting") Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 14d26075c8635f..615c4d766c9093 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -248,5 +248,7 @@ cleanup echo ALL PASS -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT +if [[ $do_umount ]]; then + umount $CGROUP_ROOT + rm -rf $CGROUP_ROOT +fi From 25ea45f9058026e644a2c9eaf494aef789576bdf Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:05 +0500 Subject: [PATCH 616/707] selftests/mm: run_vmtests: remove sudo and conform to tap Remove sudo as some test running environments may not have sudo available. Instead skip the test if root privileges aren't available in the test. Link: https://lkml.kernel.org/r/20240125154608.720072-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/on-fault-limit.c | 36 ++++++++++----------- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c index b5888d613f34eb..0ea98ffab35892 100644 --- a/tools/testing/selftests/mm/on-fault-limit.c +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -5,40 +5,38 @@ #include #include #include +#include "../kselftest.h" -static int test_limit(void) +static void test_limit(void) { - int ret = 1; struct rlimit lims; void *map; - if (getrlimit(RLIMIT_MEMLOCK, &lims)) { - perror("getrlimit"); - return ret; - } + if (getrlimit(RLIMIT_MEMLOCK, &lims)) + ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno)); - if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall: %s\n", strerror(errno)); map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + + ksft_test_result(map == MAP_FAILED, "Failed mmap\n"); + if (map != MAP_FAILED) - printf("mmap should have failed, but didn't\n"); - else { - ret = 0; munmap(map, 2 * lims.rlim_max); - } - munlockall(); - return ret; } int main(int argc, char **argv) { - int ret = 0; + ksft_print_header(); + ksft_set_plan(1); + + if (getuid()) + ksft_test_result_skip("Require root privileges to run\n"); + else + test_limit(); - ret += test_limit(); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 246d53a5d7f287..e373d592dbf5cb 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -291,7 +291,7 @@ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages CATEGORY="compaction" run_test ./compaction_test -CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit +CATEGORY="mlock" run_test ./on-fault-limit CATEGORY="mmap" run_test ./map_populate From 4fc1e7ffb61992bd1f2460214c766beb8c4a47c8 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:06 +0500 Subject: [PATCH 617/707] selftests/mm: save and restore nr_hugepages value Save and restore nr_hugepages before changing it during the test. A test should not change system wide settings. Link: https://lkml.kernel.org/r/20240125154608.720072-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++++ tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 3 +++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 0899019a7fcb4b..16b3c2e428dbdb 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -11,6 +11,8 @@ if [[ $(id -u) -ne 0 ]]; then exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + fault_limit_file=limit_in_bytes reservation_limit_file=rsvd.limit_in_bytes fault_usage_file=usage_in_bytes @@ -582,3 +584,5 @@ if [[ $do_umount ]]; then umount $cgroup_path rmdir $cgroup_path fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 615c4d766c9093..11f9bbe7dc222b 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -11,6 +11,7 @@ if [[ $(id -u) -ne 0 ]]; then exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) usage_file=usage_in_bytes if [[ "$1" == "-cgroup-v2" ]]; then @@ -252,3 +253,5 @@ if [[ $do_umount ]]; then umount $CGROUP_ROOT rm -rf $CGROUP_ROOT fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages From 1fdeab609de0c57b9ef56078ddf647d36089147d Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:07 +0500 Subject: [PATCH 618/707] selftests/mm: protection_keys: save/restore nr_hugepages settings Save and restore nr_hugepages before changing it during the test. A test should not change system wide settings. Link: https://lkml.kernel.org/r/20240125154608.720072-5-usama.anjum@collabora.com Fixes: 5f23f6d082a9 ("x86/pkeys: Add self-tests") Signed-off-by: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/protection_keys.c | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 48dc151f8fca8a..f822ae31af22e2 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -54,6 +54,7 @@ int test_nr; u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +char buf[256]; void cat_into_file(char *str, char *file) { @@ -1744,6 +1745,38 @@ void pkey_setup_shadow(void) shadow_pkey_reg = __read_pkey_reg(); } +void restore_settings_atexit(void) +{ + cat_into_file(buf, "/proc/sys/vm/nr_hugepages"); +} + +void save_settings(void) +{ + int fd; + int err; + + if (geteuid()) + return; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "error opening\n"); + perror("error: "); + exit(__LINE__); + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + if (err < 0) { + fprintf(stderr, "error reading\n"); + perror("error: "); + exit(__LINE__); + } + + atexit(restore_settings_atexit); + close(fd); +} + int main(void) { int nr_iterations = 22; @@ -1751,6 +1784,7 @@ int main(void) srand((unsigned int)time(NULL)); + save_settings(); setup_handlers(); printf("has pkeys: %d\n", pkeys_supported); From f7ff0b277ff9fefe8f41b374baca8888c7dc54cd Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jan 2024 20:46:08 +0500 Subject: [PATCH 619/707] selftests/mm: run_vmtests.sh: add missing tests Add missing tests to run_vmtests.sh. The mm kselftests are run through run_vmtests.sh. If a test isn't present in this script, it'll not run with run_tests or `make -C tools/testing/selftests/mm run_tests`. Link: https://lkml.kernel.org/r/20240125154608.720072-6-usama.anjum@collabora.com Cc: Ryan Roberts Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 5 +++++ tools/testing/selftests/mm/run_vmtests.sh | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2453add65d12f8..f3aec7be80730b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -114,6 +114,11 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh TEST_FILES += va_high_addr_switch.sh +TEST_FILES += charge_reserved_hugetlb.sh +TEST_FILES += hugetlb_reparenting_test.sh + +# required by charge_reserved_hugetlb.sh +TEST_FILES += write_hugetlb_memory.sh include ../lib.mk diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index e373d592dbf5cb..a0f37e44389372 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -19,6 +19,7 @@ usage: ${BASH_SOURCE[0]:-$0} [ options ] -t: specify specific categories to tests to run -h: display this message -n: disable TAP output + -d: run destructive tests The default behavior is to run required tests only. If -a is specified, will run all tests. @@ -79,6 +80,7 @@ EOF } RUN_ALL=false +RUN_DESTRUCTIVE_TEST=false TAP_PREFIX="# " while getopts "aht:n" OPT; do @@ -87,6 +89,7 @@ while getopts "aht:n" OPT; do "h") usage ;; "t") VM_SELFTEST_ITEMS=${OPTARG} ;; "n") TAP_PREFIX= ;; + "a") RUN_DESTRUCTIVE_TEST=true ;; esac done shift $((OPTIND -1)) @@ -304,6 +307,11 @@ CATEGORY="process_mrelease" run_test ./mrelease_test CATEGORY="mremap" run_test ./mremap_test CATEGORY="hugetlb" run_test ./thuge-gen +CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 +CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 +if $RUN_DESTRUCTIVE_TEST; then +CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison +fi if [ $VADDR64 -ne 0 ]; then From 6d5cbfaf058cecf52dea4839cb0681207b908c03 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 24 Jan 2024 15:27:35 +0100 Subject: [PATCH 620/707] init: remove obsolete arch_call_rest_init() wrapper Since commit 3570ee046c46b5dc ("s390/smp: keep the original lowcore for CPU 0"), there is no longer any architecture that needs to override arch_call_rest_init(). Remove the weak wrapper around rest_init(), call rest_init() directly, and make rest_init() static. Link: https://lkml.kernel.org/r/aa10868bfb176eef4abb8bb4a710b85330792694.1706106183.git.geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Ilya Leoshkevich Cc: Josh Poimboeuf Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/start_kernel.h | 2 -- init/main.c | 9 ++------- tools/objtool/noreturns.h | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h index a9806a44a605c7..09f994ac87df44 100644 --- a/include/linux/start_kernel.h +++ b/include/linux/start_kernel.h @@ -9,7 +9,5 @@ up something else. */ extern asmlinkage void __init __noreturn start_kernel(void); -extern void __init __noreturn arch_call_rest_init(void); -extern void __ref __noreturn rest_init(void); #endif /* _LINUX_START_KERNEL_H */ diff --git a/init/main.c b/init/main.c index e24b0780fdff7a..521f40770e67dd 100644 --- a/init/main.c +++ b/init/main.c @@ -681,7 +681,7 @@ static void __init setup_command_line(char *command_line) static __initdata DECLARE_COMPLETION(kthreadd_done); -noinline void __ref __noreturn rest_init(void) +static noinline void __ref __noreturn rest_init(void) { struct task_struct *tsk; int pid; @@ -822,11 +822,6 @@ static int __init early_randomize_kstack_offset(char *buf) early_param("randomize_kstack_offset", early_randomize_kstack_offset); #endif -void __init __weak __noreturn arch_call_rest_init(void) -{ - rest_init(); -} - static void __init print_unknown_bootoptions(void) { char *unknown_options; @@ -1069,7 +1064,7 @@ void start_kernel(void) kcsan_init(); /* Do the rest non-__init'ed, we're now alive */ - arch_call_rest_init(); + rest_init(); /* * Avoid stack canaries in callers of boot_init_stack_canary for gcc-10 diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 1685d7ea6a9f70..7cda577da897ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -12,7 +12,6 @@ NORETURN(__reiserfs_panic) NORETURN(__stack_chk_fail) NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) -NORETURN(arch_call_rest_init) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) From 4d3e70d4f83aa98d4e50fd163d44872e25e31479 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:20 +0500 Subject: [PATCH 621/707] selftests/mm: hugepage-shm: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. The "." was being printed inside for loop to indicate the writes progress. This was extraneous and hence removed in the patch. Link: https://lkml.kernel.org/r/20240126112129.1480265-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Jiaqi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugepage-shm.c | 47 +++++++++++------------ 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c index 478bb1e989e9f3..f949dbbc345409 100644 --- a/tools/testing/selftests/mm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -34,11 +34,10 @@ #include #include #include +#include "../kselftest.h" #define LENGTH (256UL*1024*1024) -#define dprintf(x) printf(x) - /* Only ia64 requires this */ #ifdef __ia64__ #define ADDR (void *)(0x8000000000000000UL) @@ -54,44 +53,42 @@ int main(void) unsigned long i; char *shmaddr; + ksft_print_header(); + ksft_set_plan(1); + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } - printf("shmid: 0x%x\n", shmid); + if (shmid < 0) + ksft_exit_fail_msg("shmget: %s\n", strerror(errno)); + + ksft_print_msg("shmid: 0x%x\n", shmid); shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); if (shmaddr == (char *)-1) { - perror("Shared memory attach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(2); + ksft_exit_fail_msg("Shared memory attach failure: %s\n", strerror(errno)); } - printf("shmaddr: %p\n", shmaddr); - dprintf("Starting the writes:\n"); - for (i = 0; i < LENGTH; i++) { + ksft_print_msg("shmaddr: %p\n", shmaddr); + + ksft_print_msg("Starting the writes:"); + for (i = 0; i < LENGTH; i++) shmaddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - dprintf("."); - } - dprintf("\n"); + ksft_print_msg("Done.\n"); - dprintf("Starting the Check..."); + ksft_print_msg("Starting the Check..."); for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) { - printf("\nIndex %lu mismatched\n", i); - exit(3); - } - dprintf("Done.\n"); + if (shmaddr[i] != (char)i) + ksft_exit_fail_msg("\nIndex %lu mismatched\n", i); + ksft_print_msg("Done.\n"); if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(4); + ksft_exit_fail_msg("Detach failure: %s\n", strerror(errno)); } shmctl(shmid, IPC_RMID, NULL); - return 0; + ksft_test_result_pass("Completed test\n"); + + ksft_finished(); } From 216bd78b2e3198c7a92e0ba0b00df658cc3b9220 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:21 +0500 Subject: [PATCH 622/707] selftests/mm: hugepage-vmemmap: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240126112129.1480265-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Jiaqi Yan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugepage-vmemmap.c | 39 ++++++++----------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index 894d28c3dd4785..9080e34bb297e5 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -11,6 +11,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" #define PAGE_COMPOUND_HEAD (1UL << 15) #define PAGE_COMPOUND_TAIL (1UL << 16) @@ -77,7 +78,7 @@ static int check_page_flags(unsigned long pfn) read(fd, &pageflags, sizeof(pageflags)); if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { close(fd); - printf("Head page flags (%lx) is invalid\n", pageflags); + ksft_print_msg("Head page flags (%lx) is invalid\n", pageflags); return -1; } @@ -91,7 +92,7 @@ static int check_page_flags(unsigned long pfn) if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { close(fd); - printf("Tail page flags (%lx) is invalid\n", pageflags); + ksft_print_msg("Tail page flags (%lx) is invalid\n", pageflags); return -1; } } @@ -106,18 +107,17 @@ int main(int argc, char **argv) void *addr; unsigned long pfn; + ksft_print_header(); + ksft_set_plan(1); + pagesize = psize(); maplength = default_huge_page_size(); - if (!maplength) { - printf("Unable to determine huge page size\n"); - exit(1); - } + if (!maplength) + ksft_exit_fail_msg("Unable to determine huge page size\n"); addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); /* Trigger allocation of HugeTLB page. */ write_bytes(addr, maplength); @@ -125,23 +125,16 @@ int main(int argc, char **argv) pfn = virt_to_pfn(addr); if (pfn == -1UL) { munmap(addr, maplength); - perror("virt_to_pfn"); - exit(1); + ksft_exit_fail_msg("virt_to_pfn: %s\n", strerror(errno)); } - printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + ksft_print_msg("Returned address is %p whose pfn is %lx\n", addr, pfn); - if (check_page_flags(pfn) < 0) { - munmap(addr, maplength); - perror("check_page_flags"); - exit(1); - } + ksft_test_result(!check_page_flags(pfn), "check_page_flags\n"); /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, maplength)) { - perror("munmap"); - exit(1); - } + if (munmap(addr, maplength)) + ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); - return 0; + ksft_finished(); } From 7daa749b593b06623cd963569b0cbe91e6a346d3 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:22 +0500 Subject: [PATCH 623/707] selftests/mm: hugetlb-madvise: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240126112129.1480265-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Jiaqi Yan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 207 +++++++------------ 1 file changed, 80 insertions(+), 127 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index f32d99565c5eaa..2d9b27bc0d01d1 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -19,19 +19,14 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ #define validate_free_pages(exp_free) \ - do { \ - int fhp = get_free_hugepages(); \ - if (fhp != (exp_free)) { \ - printf("Unexpected number of free huge " \ - "pages line %d\n", __LINE__); \ - exit(1); \ - } \ - } while (0) + ksft_test_result(get_free_hugepages() == (exp_free), \ + "Validation of free pages (%d)\n", __LINE__) unsigned long huge_page_size; unsigned long base_page_size; @@ -64,28 +59,27 @@ int main(int argc, char **argv) int fd; int ret; + ksft_print_header(); + huge_page_size = default_huge_page_size(); - if (!huge_page_size) { - printf("Unable to determine huge page size, exiting!\n"); - exit(1); - } + if (!huge_page_size) + ksft_exit_fail_msg("Unable to determine huge page size, exiting!\n"); + base_page_size = sysconf(_SC_PAGE_SIZE); - if (!huge_page_size) { - printf("Unable to determine base page size, exiting!\n"); - exit(1); - } + if (!huge_page_size) + ksft_exit_fail_msg("Unable to determine base page size, exiting!\n"); free_hugepages = get_free_hugepages(); if (free_hugepages < MIN_FREE_PAGES) { - printf("Not enough free huge pages to test, exiting!\n"); - exit(1); + ksft_print_msg("Not enough free huge pages to test, exiting!\n"); + ksft_finished(); } fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_msg("memfd_create() failed\n"); + + ksft_set_plan(37); /* * Test validity of MADV_DONTNEED addr and length arguments. mmap @@ -97,16 +91,13 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + if (munmap(addr, huge_page_size) || - munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, - huge_page_size)) { - perror("munmap"); - exit(1); - } + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, huge_page_size)) + ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); + addr = addr + huge_page_size; write_fault_pages(addr, NR_HUGE_PAGES); @@ -114,21 +105,13 @@ int main(int argc, char **argv) /* addr before mapping should fail */ ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid addr line %d\n", - __LINE__); - exit(1); - } + MADV_DONTNEED); + ksft_test_result(ret, "The madvise call with invalid address\n"); /* addr + length after mapping should fail */ ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid length line %d\n", - __LINE__); - exit(1); - } + MADV_DONTNEED); + ksft_test_result(ret, "The madvise call with invalid address\n"); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); @@ -139,10 +122,9 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + write_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); @@ -150,19 +132,12 @@ int main(int argc, char **argv) ret = madvise(addr + base_page_size, NR_HUGE_PAGES * huge_page_size - base_page_size, MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with unaligned start address %d\n", - __LINE__); - exit(1); - } + ksft_test_result(ret, "The madvise call with unaligned start address\n"); /* addr + length should be aligned down to huge page size */ - if (madvise(addr, - ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, - MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED); + ksft_test_result(!ret, "The madvise call with aligned start address\n"); /* should free all but last page in mapping */ validate_free_pages(free_hugepages - 1); @@ -177,17 +152,14 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + write_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_DONTNEED on anonymous private mapping\n"); /* should free all pages in mapping */ validate_free_pages(free_hugepages); @@ -197,29 +169,25 @@ int main(int argc, char **argv) /* * Test MADV_DONTNEED on private mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_msg("fallocate: %s\n", strerror(errno)); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); /* read should not consume any pages */ read_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_DONTNEED on private mapping of file\n"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* writes should allocate private pages */ @@ -227,10 +195,9 @@ int main(int argc, char **argv) validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise should free private pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_DONTNEED on private mapping of file\n"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* writes should allocate private pages */ @@ -245,10 +212,9 @@ int main(int argc, char **argv) * implementation. */ if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_msg("fallocate: %s\n", strerror(errno)); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); @@ -256,29 +222,25 @@ int main(int argc, char **argv) /* * Test MADV_DONTNEED on shared mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_msg("fallocate: %s\n", strerror(errno)); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); /* write should not consume any pages */ write_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_DONTNEED on shared mapping of file\n"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* @@ -286,29 +248,25 @@ int main(int argc, char **argv) * * madvise is same as hole punch and should free all pages. */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE); + ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n"); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); /* * Test MADV_REMOVE on shared and private mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_msg("fallocate: %s\n", strerror(errno)); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); /* shared write should not consume any additional pages */ write_fault_pages(addr, NR_HUGE_PAGES); @@ -317,10 +275,8 @@ int main(int argc, char **argv) addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr2 == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr2 == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); /* private read should not consume any pages */ read_fault_pages(addr2, NR_HUGE_PAGES); @@ -331,17 +287,15 @@ int main(int argc, char **argv) validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise of shared mapping should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n"); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise of private mapping should free private pages */ - if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); + ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* private write should consume additional pages again */ @@ -353,15 +307,14 @@ int main(int argc, char **argv) * not correct. private pages should not be freed, but this is * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } + ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE); + ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n"); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); close(fd); - return 0; + ksft_finished(); } From eb8bb77ecc5a0e104527b7e41791f2d94f044f93 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:23 +0500 Subject: [PATCH 624/707] selftests/mm: khugepaged: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Skip test if root privileges aren't provided. Link: https://lkml.kernel.org/r/20240126112129.1480265-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Jiaqi Yan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 385 ++++++++++-------------- 1 file changed, 163 insertions(+), 222 deletions(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 829320a519e723..d51fdaee7dc6ae 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -23,6 +23,7 @@ #include "vm_util.h" #include "thp_settings.h" +#include "../kselftest.h" #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -73,22 +74,20 @@ struct file_info { static struct file_info finfo; static bool skip_settings_restore; -static int exit_status; static void success(const char *msg) { - printf(" \e[32m%s\e[0m\n", msg); + ksft_test_result_pass("%s\n", msg); } static void fail(const char *msg) { - printf(" \e[31m%s\e[0m\n", msg); - exit_status++; + ksft_test_result_fail("%s\n", msg); } static void skip(const char *msg) { - printf(" \e[33m%s\e[0m\n", msg); + ksft_test_result_skip("\e%s\n", msg); } static void restore_settings_atexit(void) @@ -96,9 +95,8 @@ static void restore_settings_atexit(void) if (skip_settings_restore) return; - printf("Restore THP and khugepaged settings..."); thp_restore_settings(); - success("OK"); + ksft_print_msg("Restored THP and khugepaged settings...\n"); skip_settings_restore = true; } @@ -106,12 +104,12 @@ static void restore_settings_atexit(void) static void restore_settings(int sig) { /* exit() will invoke the restore_settings_atexit handler. */ - exit(sig ? EXIT_FAILURE : exit_status); + ksft_finished(); } static void save_settings(void) { - printf("Save THP and khugepaged settings..."); + ksft_print_msg("Save THP and khugepaged settings...\n"); if (file_ops && finfo.type == VMA_FILE) thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); thp_save_settings(); @@ -135,60 +133,50 @@ static void get_finfo(const char *dir) finfo.dir = dir; stat(finfo.dir, &path_stat); - if (!S_ISDIR(path_stat.st_mode)) { - printf("%s: Not a directory (%s)\n", __func__, finfo.dir); - exit(EXIT_FAILURE); - } + if (!S_ISDIR(path_stat.st_mode)) + ksft_exit_fail_msg("%s: Not a directory (%s)\n", __func__, finfo.dir); + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (statfs(finfo.dir, &fs)) { - perror("statfs()"); - exit(EXIT_FAILURE); - } + finfo.dir) >= sizeof(finfo.path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); + + if (statfs(finfo.dir, &fs)) + ksft_exit_fail_msg("statfs(): %s\n", strerror(errno)); + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; if (finfo.type == VMA_SHMEM) return; /* Find owning device's queue/read_ahead_kb control */ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", - major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } + major(path_stat.st_dev), minor(path_stat.st_dev)) >= sizeof(path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); + + if (read_file(path, buf, sizeof(buf)) < 0) + ksft_exit_fail_msg("read_file(read_num): %s\n", strerror(errno)); + if (strstr(buf, "DEVTYPE=disk")) { /* Found it */ if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/dev/block/%d:%d/queue/read_ahead_kb", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } + >= sizeof(finfo.dev_queue_read_ahead_path)) + ksft_exit_fail_msg("%s: Pathname is too long: %s\n", __func__, + strerror(errno)); return; } - if (!strstr(buf, "DEVTYPE=partition")) { - printf("%s: Unknown device type: %s\n", __func__, path); - exit(EXIT_FAILURE); - } + if (!strstr(buf, "DEVTYPE=partition")) + ksft_exit_fail_msg("%s: Unknown device type: %s\n", __func__, path); /* * Partition of block device - need to find actual device. * Using naming convention that devnameN is partition of * device devname. */ str = strstr(buf, "DEVNAME="); - if (!str) { - printf("%s: Could not read: %s", __func__, path); - exit(EXIT_FAILURE); - } + if (!str) + ksft_exit_fail_msg("%s: Could not read: %s", __func__, path); + str += 8; end = str; while (*end) { @@ -197,16 +185,14 @@ static void get_finfo(const char *dir) if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } + str) >= sizeof(finfo.dev_queue_read_ahead_path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); + return; } ++end; } - printf("%s: Could not read: %s\n", __func__, path); - exit(EXIT_FAILURE); + ksft_exit_fail_msg("%s: Could not read: %s\n", __func__, path); } static bool check_swap(void *addr, unsigned long size) @@ -219,26 +205,21 @@ static bool check_swap(void *addr, unsigned long size) ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, PID_SMAPS); + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + /* * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. @@ -261,10 +242,8 @@ static void *alloc_mapping(int nr) p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (p != BASE_ADDR) { - printf("Failed to allocate VMA at %p\n", BASE_ADDR); - exit(EXIT_FAILURE); - } + if (p != BASE_ADDR) + ksft_exit_fail_msg("Failed to allocate VMA at %p\n", BASE_ADDR); return p; } @@ -314,19 +293,16 @@ static void *alloc_hpage(struct mem_ops *ops) * khugepaged on low-load system (like a test machine), which * would cause MADV_COLLAPSE to fail with EAGAIN. */ - printf("Allocate huge page..."); - if (madvise_collapse_retry(p, hpage_pmd_size)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (!ops->check_huge(p, 1)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { - perror("madvise(MADV_HUGEPAGE)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Allocate huge page...\n"); + if (madvise_collapse_retry(p, hpage_pmd_size)) + ksft_exit_fail_msg("madvise(MADV_COLLAPSE): %s\n", strerror(errno)); + + if (!ops->check_huge(p, 1)) + ksft_exit_fail_msg("madvise(MADV_COLLAPSE): %s\n", strerror(errno)); + + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) + ksft_exit_fail_msg("madvise(MADV_HUGEPAGE): %s\n", strerror(errno)); + success("OK"); return p; } @@ -335,13 +311,12 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; - for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { - printf("Page %d is corrupted: %#x\n", - i, p[i * page_size / sizeof(*p)]); - exit(EXIT_FAILURE); - } - } + for (i = start / page_size; i < end / page_size; i++) + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) + ksft_print_msg("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + + ksft_test_result(i == end/page_size, "Validated memory\n"); } static void *anon_setup_area(int nr_hpages) @@ -371,14 +346,12 @@ static void *file_setup_area(int nr_hpages) unsigned long size; unlink(finfo.path); /* Cleanup from previous failed tests */ - printf("Creating %s for collapse%s...", finfo.path, - finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + ksft_print_msg("Creating %s for collapse%s...\n", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, 777); - if (fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } + if (fd < 0) + ksft_exit_fail_msg("open(): %s\n", strerror(errno)); size = nr_hpages * hpage_pmd_size; p = alloc_mapping(nr_hpages); @@ -388,18 +361,15 @@ static void *file_setup_area(int nr_hpages) munmap(p, size); success("OK"); - printf("Opening %s read only for collapse...", finfo.path); + ksft_print_msg("Opening %s read only for collapse...\n", finfo.path); finfo.fd = open(finfo.path, O_RDONLY, 777); - if (finfo.fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } + if (finfo.fd < 0) + ksft_exit_fail_msg("open(): %s\n", strerror(errno)); + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, finfo.fd, 0); - if (p == MAP_FAILED || p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } + if (p == MAP_FAILED || p != BASE_ADDR) + ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); /* Drop page cache */ write_file("/proc/sys/vm/drop_caches", "3", 2); @@ -416,10 +386,8 @@ static void file_cleanup_area(void *p, unsigned long size) static void file_fault(void *p, unsigned long start, unsigned long end) { - if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { - perror("madvise(MADV_POPULATE_READ"); - exit(EXIT_FAILURE); - } + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) + ksft_exit_fail_msg("madvise(MADV_POPULATE_READ: %s\n", strerror(errno)); } static bool file_check_huge(void *addr, int nr_hpages) @@ -430,7 +398,7 @@ static bool file_check_huge(void *addr, int nr_hpages) case VMA_SHMEM: return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); default: - exit(EXIT_FAILURE); + ksft_exit_fail_msg("Wrong type\n"); return false; } } @@ -441,20 +409,16 @@ static void *shmem_setup_area(int nr_hpages) unsigned long size = nr_hpages * hpage_pmd_size; finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); - if (finfo.fd < 0) { - perror("memfd_create()"); - exit(EXIT_FAILURE); - } - if (ftruncate(finfo.fd, size)) { - perror("ftruncate()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, - 0); - if (p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } + if (finfo.fd < 0) + ksft_exit_fail_msg("memfd_create(): %s\n", strerror(errno)); + + if (ftruncate(finfo.fd, size)) + ksft_exit_fail_msg("ftruncate(): %s\n", strerror(errno)); + + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, 0); + if (p != BASE_ADDR) + ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); + return p; } @@ -499,7 +463,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, int ret; struct thp_settings settings = *thp_current_settings(); - printf("%s...", msg); + ksft_print_msg("%s...\n", msg); /* * Prevent khugepaged interference and tests that MADV_COLLAPSE @@ -526,10 +490,9 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(p, 0)) + ksft_exit_fail_msg("Unexpected huge page\n"); + __madvise_collapse(msg, p, nr_hpages, ops, expect); } @@ -541,23 +504,20 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages, int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(p, 0)) + ksft_exit_fail_msg("Unexpected huge page\n"); madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = thp_read_num("khugepaged/full_scans") + 2; - printf("%s...", msg); + ksft_print_msg("%s...\n", msg); while (timeout--) { if (ops->check_huge(p, nr_hpages)) break; if (thp_read_num("khugepaged/full_scans") >= full_scans) break; - printf("."); usleep(TICK); } @@ -623,7 +583,7 @@ static void alloc_at_fault(void) p = alloc_mapping(1); *p = 1; - printf("Allocate huge page on fault..."); + ksft_print_msg("Allocate huge page on fault...\n"); if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else @@ -632,7 +592,7 @@ static void alloc_at_fault(void) thp_pop_settings(); madvise(p, page_size, MADV_DONTNEED); - printf("Split huge PMD on MADV_DONTNEED..."); + ksft_print_msg("Split huge PMD on MADV_DONTNEED...\n"); if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else @@ -688,7 +648,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o if (is_tmpfs(ops)) { /* shmem pages always in the page cache */ - printf("tmpfs..."); + ksft_print_msg("tmpfs...\n"); skip("Skip"); goto skip; } @@ -717,11 +677,10 @@ static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_op p = ops->setup_area(1); ops->fault(p, 0, hpage_pmd_size); - printf("Swapout one page..."); - if (madvise(p, page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Swapout one page...\n"); + if (madvise(p, page_size, MADV_PAGEOUT)) + ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno)); + if (check_swap(p, page_size)) { success("OK"); } else { @@ -744,11 +703,10 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o p = ops->setup_area(1); ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); - if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Swapout %d of %d pages...\n", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) + ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno)); + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { success("OK"); } else { @@ -762,12 +720,11 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o if (c->enforce_pte_scan_limits) { ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, + ksft_print_msg("Swapout %d of %d pages...\n", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) + ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno)); + if (check_swap(p, max_ptes_swap * page_size)) { success("OK"); } else { @@ -791,13 +748,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc if (is_tmpfs(ops)) { /* MADV_DONTNEED won't evict tmpfs pages */ - printf("tmpfs..."); + ksft_print_msg("tmpfs...\n"); skip("Skip"); goto skip; } madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); + ksft_print_msg("Split huge page leaving single PTE mapping compound page...\n"); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); if (ops->check_huge(p, 0)) success("OK"); @@ -816,7 +773,7 @@ static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops void *p; p = alloc_hpage(ops); - printf("Split huge page leaving single PTE page table full of compound pages..."); + ksft_print_msg("Split huge page leaving single PTE page table full of compound pages...\n"); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); if (ops->check_huge(p, 0)) @@ -837,15 +794,14 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { - printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", - i + 1, hpage_pmd_nr); + ksft_print_msg("\rConstruct PTE page table full of different PTE-mapped " + "compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); ops->fault(BASE_ADDR, 0, hpage_pmd_size); - if (!ops->check_huge(BASE_ADDR, 1)) { - printf("Failed to allocate huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(BASE_ADDR, 1)) + ksft_exit_fail_msg("Failed to allocate huge page\n"); + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); p = mremap(BASE_ADDR - i * page_size, @@ -853,22 +809,20 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops (i + 1) * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, BASE_ADDR + 2 * hpage_pmd_size); - if (p == MAP_FAILED) { - perror("mremap+unmap"); - exit(EXIT_FAILURE); - } + if (p == MAP_FAILED) + ksft_exit_fail_msg("mremap+unmap: %s\n", strerror(errno)); p = mremap(BASE_ADDR + 2 * hpage_pmd_size, (i + 1) * page_size, (i + 1) * page_size + hpage_pmd_size, MREMAP_MAYMOVE | MREMAP_FIXED, BASE_ADDR - (i + 1) * page_size); - if (p == MAP_FAILED) { - perror("mremap+alloc"); - exit(EXIT_FAILURE); - } + if (p == MAP_FAILED) + ksft_exit_fail_msg("mremap+alloc: %s\n", strerror(errno)); } + ksft_print_msg("\n"); + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); ops->fault(p, 0, hpage_pmd_size); if (!ops->check_huge(p, 1)) @@ -890,23 +844,19 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) p = ops->setup_area(1); - printf("Allocate small page..."); + ksft_print_msg("Allocate small page...\n"); ops->fault(p, 0, page_size); if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - printf("Share small page over fork()..."); + ksft_print_msg("Share small page over fork()...\n"); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); + ksft_test_result(ops->check_huge(p, 0), "%s: child\n", __func__); ops->fault(p, page_size, 2 * page_size); c->collapse("Collapse PTE table with single page shared with parent process", @@ -914,13 +864,12 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) validate_memory(p, 0, page_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + exit(0); } wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - printf("Check if parent still has small page..."); + ksft_print_msg("Check if parent still has small page...\n"); if (ops->check_huge(p, 0)) success("OK"); else @@ -931,22 +880,17 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { - int wstatus; void *p; p = alloc_hpage(ops); - printf("Share huge page over fork()..."); + ksft_print_msg("Share huge page over fork()...\n"); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); + ksft_test_result(ops->check_huge(p, 1), "%s: child\n", __func__); - printf("Split huge page PMD in child process..."); + ksft_print_msg("Split huge page PMD in child process...\n"); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); if (ops->check_huge(p, 0)) @@ -963,13 +907,12 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + exit(0); } - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); + wait(NULL); - printf("Check if parent still has huge page..."); + ksft_print_msg("Check if parent still has huge page...\n"); if (ops->check_huge(p, 1)) success("OK"); else @@ -981,23 +924,18 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); - int wstatus; void *p; p = alloc_hpage(ops); - printf("Share huge page over fork()..."); + ksft_print_msg("Share huge page over fork()...\n"); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); + ksft_test_result(ops->check_huge(p, 1), "%s: child\n", __func__); - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + ksft_print_msg("Trigger CoW on page %d of %d...\n", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); if (ops->check_huge(p, 0)) success("OK"); @@ -1008,8 +946,8 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops 1, ops, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ksft_print_msg("Trigger CoW on page %d of %d...\n", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); if (ops->check_huge(p, 0)) @@ -1023,13 +961,12 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + exit(0); } - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); + wait(NULL); - printf("Check if parent still has huge page..."); + ksft_print_msg("Check if parent still has huge page...\n"); if (ops->check_huge(p, 1)) success("OK"); else @@ -1083,20 +1020,19 @@ static void madvise_retracted_page_tables(struct collapse_context *c, static void usage(void) { - fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] [dir]\n\n"); - fprintf(stderr, "\t\t: :\n"); - fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); - fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); - fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); - fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); - fprintf(stderr, "\n\tSupported Options:\n"); - fprintf(stderr, "\t\t-h: This help message.\n"); - fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); - fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); - exit(1); + ksft_print_msg("\nUsage: ./khugepaged [OPTIONS] [dir]\n\n"); + ksft_print_msg("\t\t: :\n"); + ksft_print_msg("\t\t: [all|khugepaged|madvise]\n"); + ksft_print_msg("\t\t: [all|anon|file|shmem]\n"); + ksft_print_msg("\n\t\"file,all\" mem_type requires [dir] argument\n"); + ksft_print_msg("\n\t\"file,all\" mem_type requires kernel built with\n"); + ksft_print_msg("\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + ksft_print_msg("\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + ksft_print_msg("\tmounted with huge=madvise option for khugepaged tests to work\n"); + ksft_print_msg("\n\tSupported Options:\n"); + ksft_print_msg("\t\t-h: This help message.\n"); + ksft_print_msg("\t\t-s: mTHP size, expressed as page order.\n"); + ksft_exit_fail_msg("\t\t Defaults to 0. Use this size for anon allocations.\n"); } static void parse_test_type(int argc, char **argv) @@ -1190,16 +1126,21 @@ int main(int argc, char **argv) .read_ahead_kb = 0, }; + ksft_print_header(); + + if (getuid()) + ksft_finished(); + + ksft_set_plan(65); + parse_test_type(argc, argv); setbuf(stdout, NULL); page_size = getpagesize(); hpage_pmd_size = read_pmd_pagesize(); - if (!hpage_pmd_size) { - printf("Reading PMD pagesize failed"); - exit(EXIT_FAILURE); - } + if (!hpage_pmd_size) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); hpage_pmd_nr = hpage_pmd_size / page_size; hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); @@ -1217,7 +1158,7 @@ int main(int argc, char **argv) #define TEST(t, c, o) do { \ if (c && o) { \ - printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + ksft_print_msg("Run test: " #t " (%s:%s)\n", c->name, o->name); \ t(c, o); \ } \ } while (0) @@ -1281,5 +1222,5 @@ int main(int argc, char **argv) TEST(madvise_retracted_page_tables, madvise_context, file_ops); TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); - restore_settings(0); + ksft_finished(); } From 0bd0fe075ee70089850fc819a3b01cc581bcae22 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:24 +0500 Subject: [PATCH 625/707] selftests/mm: hugetlb-read-hwpoison: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240126112129.1480265-5-usama.anjum@collabora.com Reviewed-by: Jiaqi Yan Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/hugetlb-read-hwpoison.c | 116 ++++++++---------- 1 file changed, 54 insertions(+), 62 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c index ba6cc6f9cabcdf..23b41b88c6aff0 100644 --- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -58,8 +58,8 @@ static bool verify_chunk(char *buf, size_t len, char val) for (i = 0; i < len; ++i) { if (buf[i] != val) { - printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n", - i, buf[i], val); + ksft_print_msg(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n", + i, buf[i], val); return false; } } @@ -75,21 +75,21 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, ssize_t total_ret_count = 0; char val = offset / wr_chunk_size + offset % wr_chunk_size; - printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); - printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", - expected); + ksft_print_msg(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); + ksft_print_msg(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); if (lseek(fd, offset, SEEK_SET) < 0) { - perror(PREFIX ERROR_PREFIX "seek failed"); + ksft_perror(PREFIX ERROR_PREFIX "seek failed"); return false; } while (offset + total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { - printf(PREFIX PREFIX "read reach end of the file\n"); + ksft_print_msg(PREFIX PREFIX "read reach end of the file\n"); break; } else if (ret_count < 0) { - perror(PREFIX ERROR_PREFIX "read failed"); + ksft_perror(PREFIX ERROR_PREFIX "read failed"); break; } ++val; @@ -98,8 +98,8 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, total_ret_count += ret_count; } - printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", - total_ret_count); + ksft_print_msg(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); return total_ret_count == expected; } @@ -112,15 +112,15 @@ static bool read_hugepage_filemap(int fd, size_t len, ssize_t total_ret_count = 0; char val = 0; - printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", - expected); + ksft_print_msg(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); while (total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { - printf(PREFIX PREFIX "read reach end of the file\n"); + ksft_print_msg(PREFIX PREFIX "read reach end of the file\n"); break; } else if (ret_count < 0) { - perror(PREFIX ERROR_PREFIX "read failed"); + ksft_perror(PREFIX ERROR_PREFIX "read failed"); break; } ++val; @@ -129,8 +129,8 @@ static bool read_hugepage_filemap(int fd, size_t len, total_ret_count += ret_count; } - printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", - total_ret_count); + ksft_print_msg(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); return total_ret_count == expected; } @@ -142,14 +142,14 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) char *filemap = NULL; if (ftruncate(fd, len) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate failed"); + ksft_perror(PREFIX ERROR_PREFIX "ftruncate failed"); return status; } filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (filemap == MAP_FAILED) { - perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + ksft_perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); goto done; } @@ -162,7 +162,7 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) munmap(filemap, len); done: if (ftruncate(fd, 0) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + ksft_perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); status = TEST_FAILED; } @@ -179,14 +179,14 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, const unsigned long pagesize = getpagesize(); if (ftruncate(fd, len) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate failed"); + ksft_perror(PREFIX ERROR_PREFIX "ftruncate failed"); return status; } filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (filemap == MAP_FAILED) { - perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + ksft_perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); goto done; } @@ -201,7 +201,7 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, */ hwp_addr = filemap + len / 2 + pagesize; if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) { - perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed"); + ksft_perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed"); goto unmap; } @@ -228,7 +228,7 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, munmap(filemap, len); done: if (ftruncate(fd, 0) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + ksft_perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); status = TEST_FAILED; } @@ -240,27 +240,32 @@ static int create_hugetlbfs_file(struct statfs *file_stat) int fd; fd = memfd_create("hugetlb_tmp", MFD_HUGETLB); - if (fd < 0) { - perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file"); - return -1; - } + if (fd < 0) + ksft_exit_fail_msg(PREFIX ERROR_PREFIX "could not open hugetlbfs file: %s\n", + strerror(errno)); memset(file_stat, 0, sizeof(*file_stat)); + if (fstatfs(fd, file_stat)) { - perror(PREFIX ERROR_PREFIX "fstatfs failed"); - goto close; + close(fd); + ksft_exit_fail_msg(PREFIX ERROR_PREFIX "fstatfs failed: %s\n", strerror(errno)); } if (file_stat->f_type != HUGETLBFS_MAGIC) { - printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n"); - goto close; + close(fd); + ksft_exit_fail_msg(PREFIX ERROR_PREFIX "not hugetlbfs file\n"); } return fd; -close: - close(fd); - return -1; } +#define KSFT_PRINT_MSG(status, fmt, ...) \ + do { \ + if (status == TEST_SKIPPED) \ + ksft_test_result_skip(fmt, __VA_ARGS__); \ + else \ + ksft_test_result(status == TEST_PASSED, fmt, __VA_ARGS__); \ + } while (0) + int main(void) { int fd; @@ -273,50 +278,37 @@ int main(void) }; size_t i; + ksft_print_header(); + ksft_set_plan(12); + for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) { - printf("Write/read chunk size=0x%lx\n", - wr_chunk_sizes[i]); + ksft_print_msg("Write/read chunk size=0x%lx\n", + wr_chunk_sizes[i]); fd = create_hugetlbfs_file(&file_stat); - if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB read regression test...\n"); + ksft_print_msg(PREFIX "HugeTLB read regression test...\n"); status = test_hugetlb_read(fd, file_stat.f_bsize, wr_chunk_sizes[i]); - printf(PREFIX "HugeTLB read regression test...%s\n", - status_to_str(status)); + KSFT_PRINT_MSG(status, PREFIX "HugeTLB read regression test...%s\n", + status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; fd = create_hugetlbfs_file(&file_stat); - if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB read HWPOISON test...\n"); + ksft_print_msg(PREFIX "HugeTLB read HWPOISON test...\n"); status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, wr_chunk_sizes[i], false); - printf(PREFIX "HugeTLB read HWPOISON test...%s\n", - status_to_str(status)); + KSFT_PRINT_MSG(status, PREFIX "HugeTLB read HWPOISON test...%s\n", + status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; fd = create_hugetlbfs_file(&file_stat); - if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB seek then read HWPOISON test...\n"); + ksft_print_msg(PREFIX "HugeTLB seek then read HWPOISON test...\n"); status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, wr_chunk_sizes[i], true); - printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n", - status_to_str(status)); + KSFT_PRINT_MSG(status, PREFIX "HugeTLB seek then read HWPOISON test...%s\n", + status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; } - return 0; - -create_failure: - printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n"); - return -1; + ksft_finished(); } From c9850eacbd63e0cc246c29b1efd6664733575fb5 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 26 Jan 2024 16:21:26 +0500 Subject: [PATCH 626/707] selftests/mm: config: add missing configs Add configurations which are needed for - hugetlb-read-hwpoison - ksm_functional_test and ksm_test Link: https://lkml.kernel.org/r/20240126112129.1480265-7-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Jiaqi Yan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index 4309916f629e36..d16a72036eb7f1 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -7,3 +7,6 @@ CONFIG_TEST_HMM=m CONFIG_GUP_TEST=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_MEM_SOFT_DIRTY=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=y +CONFIG_KSM=y From b4d47a53076e5b359545c1d7c011d0fa61d629f0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:37:29 -0800 Subject: [PATCH 627/707] iov_iter: avoid wrap-around instrumentation in copy_compat_iovec_from_user() The loop counter "i" in copy_compat_iovec_from_user() is an int, but because the nr_segs argument is unsigned long, the signed overflow sanitizer got worried "i" could wrap around. Instead of making "i" an unsigned long (which may enlarge the type size), switch both nr_segs and i to u32. There is no truncation with nr_segs since it is never larger than UIO_MAXIOV anyway. This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Link: https://lkml.kernel.org/r/20240129183729.work.991-kees@kernel.org Signed-off-by: Kees Cook Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Andrew Morton --- lib/iov_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e0aa6b440ca5f4..d797a43dca91ac 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1166,11 +1166,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; - int ret = -EFAULT, i; + int ret = -EFAULT; + u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; From 1001c3c3a79fd285f0b2a1793206dde92a9bda54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:17 +0100 Subject: [PATCH 628/707] selftests/landlock: Test IOCTL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exercises Landlock's IOCTL feature in different combinations of handling and permitting the rights LANDLOCK_ACCESS_FS_IOCTL, LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_WRITE_FILE and LANDLOCK_ACCESS_FS_READ_DIR, and in different combinations of using files and directories. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-6-gnoack@google.com [mic: Move down linux/fs.h include to fix build issue with old libc] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 432 ++++++++++++++++++++- 1 file changed, 429 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 3203f4a5bc8595..8c1f08ad8c9114 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -23,6 +23,8 @@ #include #include +#include + #include "common.h" #ifndef renameat2 @@ -735,6 +737,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -3443,7 +3448,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3526,7 +3531,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3752,7 +3757,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3829,6 +3834,16 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) +{ + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} + TEST(memfd_ftruncate) { int fd; @@ -3845,6 +3860,417 @@ TEST(memfd_ftruncate) ASSERT_EQ(0, close(fd)); } +/* clang-format off */ +FIXTURE(ioctl) {}; +/* clang-format on */ + +FIXTURE_SETUP(ioctl) +{ + prepare_layout(_metadata); + create_file(_metadata, file1_s1d1); +} + +FIXTURE_TEARDOWN(ioctl) +{ + EXPECT_EQ(0, remove_path(file1_s1d1)); + cleanup_layout(_metadata); +} + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * These are the expected IOCTL results for a representative IOCTL from + * each of the IOCTL groups. We only distinguish the 0 and EACCES + * results here, and treat other errors as 0. + */ + const int expected_fioqsize_result; /* G1 */ + const int expected_fibmap_result; /* G2 */ + const int expected_fionread_result; /* G3 */ + const int expected_fs_ioc_zero_range_result; /* G4 */ + const int expected_fs_ioc_getflags_result; /* other */ +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL, + .allowed = 0, + .open_mode = O_RDWR, + .expected_fioqsize_result = EACCES, + .expected_fibmap_result = EACCES, + .expected_fionread_result = EACCES, + .expected_fs_ioc_zero_range_result = EACCES, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + /* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */ + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + /* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */ + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_ri_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = EACCES, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_wi_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = EACCES, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_di_allowed_d) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_DIR | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_DIR, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = EACCES, + .expected_fionread_result = EACCES, + .expected_fs_ioc_zero_range_result = EACCES, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_rw) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_RDWR, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_r) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = EACCES, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_ri) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_RDONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = 0, + .expected_fs_ioc_zero_range_result = EACCES, + .expected_fs_ioc_getflags_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_w) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = EACCES, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_wi) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .allowed = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL, + .open_mode = O_WRONLY, + .expected_fioqsize_result = 0, + .expected_fibmap_result = 0, + .expected_fionread_result = EACCES, + .expected_fs_ioc_zero_range_result = 0, + .expected_fs_ioc_getflags_result = 0, +}; + +static int test_fioqsize_ioctl(int fd) +{ + size_t sz; + + if (ioctl(fd, FIOQSIZE, &sz) < 0) + return errno; + return 0; +} + +static int test_fibmap_ioctl(int fd) +{ + int blk = 0; + + /* + * We only want to distinguish here whether Landlock already caught it, + * so we treat anything but EACCESS as success. (It commonly returns + * EPERM when missing CAP_SYS_RAWIO.) + */ + if (ioctl(fd, FIBMAP, &blk) < 0 && errno == EACCES) + return errno; + return 0; +} + +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) + +static int test_fs_ioc_zero_range_ioctl(int fd) +{ + struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ + } reservation = {}; + /* + * This can fail for various reasons, but we only want to distinguish + * here whether Landlock already caught it, so we treat anything but + * EACCES as success. + */ + if (ioctl(fd, FS_IOC_ZERO_RANGE, &reservation) < 0 && errno == EACCES) + return errno; + return 0; +} + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = dir_s1d1, + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open(file1_s1d1, variant->open_mode); + ASSERT_LE(0, file_fd); + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors. + */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fs_ioc_zero_range_result, + test_fs_ioc_zero_range_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fs_ioc_getflags_result, + test_fs_ioc_getflags_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const char *const path = dir_s1d1; + const int flag = 0; + const struct rule rules[] = { + { + .path = path, + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open(path, O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors. + */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fs_ioc_zero_range_result, + test_fs_ioc_zero_range_ioctl(dir_fd)); + EXPECT_EQ(variant->expected_fs_ioc_getflags_result, + test_fs_ioc_getflags_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const char *const path = file1_s1d1; + const int flag = 0; + const struct rule rules[] = { + { + .path = path, + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + if (variant->allowed & LANDLOCK_ACCESS_FS_READ_DIR) { + SKIP(return, "LANDLOCK_ACCESS_FS_READ_DIR " + "can not be granted on files"); + } + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open(path, variant->open_mode); + ASSERT_LE(0, file_fd); + + /* + * Checks that IOCTL commands in each IOCTL group return the expected + * errors. + */ + EXPECT_EQ(variant->expected_fioqsize_result, + test_fioqsize_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fs_ioc_zero_range_result, + test_fs_ioc_zero_range_ioctl(file_fd)); + EXPECT_EQ(variant->expected_fs_ioc_getflags_result, + test_fs_ioc_getflags_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + /* clang-format off */ FIXTURE(layout1_bind) {}; /* clang-format on */ From 4c3146bc56d500f3a281d0f50f0888c91db5df40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:18 +0100 Subject: [PATCH 629/707] selftests/landlock: Test IOCTL with memfds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the LANDLOCK_ACCESS_FS_IOCTL right is associated with the opened file during open(2), IOCTLs are supposed to work with files which are opened by means other than open(2). Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-7-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 36 ++++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 8c1f08ad8c9114..9afe37700f4741 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3844,20 +3844,38 @@ static int test_fs_ioc_getflags_ioctl(int fd) return 0; } -TEST(memfd_ftruncate) +TEST(memfd_ftruncate_and_ioctl) { - int fd; - - fd = memfd_create("name", MFD_CLOEXEC); - ASSERT_LE(0, fd); + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. */ - EXPECT_EQ(0, test_ftruncate(fd)); + for (i = 0; i < 2; i++) { + /* Creates a new memfd. */ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); - ASSERT_EQ(0, close(fd)); + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } } /* clang-format off */ From 289982f3e9cb6247e1cea35cc62a220ccb28e39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:19 +0100 Subject: [PATCH 630/707] selftests/landlock: Test ioctl(2) and ftruncate(2) with open(O_PATH) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(2) and ftruncate(2) operations on files opened with O_PATH should always return EBADF, independent of the LANDLOCK_ACCESS_FS_TRUNCATE and LANDLOCK_ACCESS_FS_IOCTL access rights in that file hierarchy. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-8-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 9afe37700f4741..1c2d158b1f0773 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3878,6 +3878,46 @@ TEST(memfd_ftruncate_and_ioctl) } } +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); +} + /* clang-format off */ FIXTURE(ioctl) {}; /* clang-format on */ From 19b18b218c2c9aca57a73df45c7db0beab03b2e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:20 +0100 Subject: [PATCH 631/707] samples/landlock: Add support for LANDLOCK_ACCESS_FS_IOCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ioctl support to the Landlock sample tool. The ioctl right is grouped with the read-write rights in the sample tool, as some ioctl requests provide features that mutate state. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-9-gnoack@google.com Signed-off-by: Mickaël Salaün --- samples/landlock/sandboxer.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c index 08596c0ef0707c..d7323e5526be29 100644 --- a/samples/landlock/sandboxer.c +++ b/samples/landlock/sandboxer.c @@ -81,7 +81,8 @@ static int parse_path(char *env_path, const char ***const path_list) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ @@ -199,11 +200,12 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd, LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ LANDLOCK_ACCESS_FS_MAKE_SYM | \ LANDLOCK_ACCESS_FS_REFER | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL) /* clang-format on */ -#define LANDLOCK_ABI_LAST 4 +#define LANDLOCK_ABI_LAST 5 int main(const int argc, char *const argv[], char *const *const envp) { @@ -317,6 +319,11 @@ int main(const int argc, char *const argv[], char *const *const envp) ruleset_attr.handled_access_net &= ~(LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP); + __attribute__((fallthrough)); + case 4: + /* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL; + fprintf(stderr, "Hint: You should update the running kernel " "to leverage Landlock features " From 2f8bb71d737c25ca97a83e47b445837aa96cec77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 8 Dec 2023 16:51:21 +0100 Subject: [PATCH 632/707] landlock: Document IOCTL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the paragraph above the fallback logic, use the shorter phrasing from the landlock(7) man page. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20231208155121.1943775-10-gnoack@google.com Signed-off-by: Mickaël Salaün --- Documentation/userspace-api/landlock.rst | 119 ++++++++++++++++++++--- 1 file changed, 104 insertions(+), 15 deletions(-) diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst index 2e38226770615d..8398851964e61c 100644 --- a/Documentation/userspace-api/landlock.rst +++ b/Documentation/userspace-api/landlock.rst @@ -75,7 +75,8 @@ to be explicit about the denied-by-default access rights. LANDLOCK_ACCESS_FS_MAKE_BLOCK | LANDLOCK_ACCESS_FS_MAKE_SYM | LANDLOCK_ACCESS_FS_REFER | - LANDLOCK_ACCESS_FS_TRUNCATE, + LANDLOCK_ACCESS_FS_TRUNCATE | + LANDLOCK_ACCESS_FS_IOCTL, .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP, @@ -84,10 +85,10 @@ to be explicit about the denied-by-default access rights. Because we may not know on which kernel version an application will be executed, it is safer to follow a best-effort security approach. Indeed, we should try to protect users as much as possible whatever the kernel they are -using. To avoid binary enforcement (i.e. either all security features or -none), we can leverage a dedicated Landlock command to get the current version -of the Landlock ABI and adapt the handled accesses. Let's check if we should -remove access rights which are only supported in higher versions of the ABI. +using. + +To be compatible with older Linux versions, we detect the available Landlock ABI +version, and only use the available subset of access rights: .. code-block:: c @@ -113,6 +114,10 @@ remove access rights which are only supported in higher versions of the ABI. ruleset_attr.handled_access_net &= ~(LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP); + __attribute__((fallthrough)); + case 4: + /* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL; } This enables to create an inclusive ruleset that will contain our rules. @@ -224,6 +229,7 @@ access rights per directory enables to change the location of such directory without relying on the destination directory access rights (except those that are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER`` documentation). + Having self-sufficient hierarchies also helps to tighten the required access rights to the minimal set of data. This also helps avoid sinkhole directories, i.e. directories where data can be linked to but not linked from. However, @@ -317,18 +323,69 @@ It should also be noted that truncating files does not require the system call, this can also be done through :manpage:`open(2)` with the flags ``O_RDONLY | O_TRUNC``. -When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE`` -right is associated with the newly created file descriptor and will be used for -subsequent truncation attempts using :manpage:`ftruncate(2)`. The behavior is -similar to opening a file for reading or writing, where permissions are checked -during :manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and +The truncate right is associated with the opened file (see below). + +Rights associated with file descriptors +--------------------------------------- + +When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE`` and +``LANDLOCK_ACCESS_FS_IOCTL`` rights is associated with the newly created file +descriptor and will be used for subsequent truncation and ioctl attempts using +:manpage:`ftruncate(2)` and :manpage:`ioctl(2)`. The behavior is similar to +opening a file for reading or writing, where permissions are checked during +:manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and :manpage:`write(2)` calls. -As a consequence, it is possible to have multiple open file descriptors for the -same file, where one grants the right to truncate the file and the other does -not. It is also possible to pass such file descriptors between processes, -keeping their Landlock properties, even when these processes do not have an -enforced Landlock ruleset. +As a consequence, it is possible that a process has multiple open file +descriptors referring to the same file, but Landlock enforces different things +when operating with these file descriptors. This can happen when a Landlock +ruleset gets enforced and the process keeps file descriptors which were opened +both before and after the enforcement. It is also possible to pass such file +descriptors between processes, keeping their Landlock properties, even when some +of the involved processes do not have an enforced Landlock ruleset. + +Restricting IOCTL commands +-------------------------- + +When the ``LANDLOCK_ACCESS_FS_IOCTL`` access right is handled, Landlock will +restrict the invocation of IOCTL commands. However, to *permit* these IOCTL +commands again, some of these IOCTL commands are then granted through other, +preexisting access rights. + +For example, consider a program which handles ``LANDLOCK_ACCESS_FS_IOCTL`` and +``LANDLOCK_ACCESS_FS_READ_FILE``. The program *permits* +``LANDLOCK_ACCESS_FS_READ_FILE`` on a file ``foo.log``. + +By virtue of granting this access on the ``foo.log`` file, it is now possible to +use common and harmless IOCTL commands which are useful when reading files, such +as ``FIONREAD``. + +On the other hand, if the program permits ``LANDLOCK_ACCESS_FS_IOCTL`` on +another file, ``FIONREAD`` will not work on that file when it is opened. As +soon as ``LANDLOCK_ACCESS_FS_READ_FILE`` is *handled* in the ruleset, the IOCTL +commands affected by it can not be reenabled though ``LANDLOCK_ACCESS_FS_IOCTL`` +any more, but are then governed by ``LANDLOCK_ACCESS_FS_READ_FILE``. + +The following table illustrates how IOCTL attempts for ``FIONREAD`` are +filtered, depending on how a Landlock ruleset handles and permits the +``LANDLOCK_ACCESS_FS_IOCTL`` and ``LANDLOCK_ACCESS_FS_READ_FILE`` access rights: + ++------------------------+-------------+-------------------+-------------------+ +| | ``IOCTL`` | ``IOCTL`` handled | ``IOCTL`` handled | +| | not handled | and permitted | and not permitted | ++------------------------+-------------+-------------------+-------------------+ +| ``READ_FILE`` not | allow | allow | deny | +| handled | | | | ++------------------------+ +-------------------+-------------------+ +| ``READ_FILE`` handled | | allow | +| and permitted | | | ++------------------------+ +-------------------+-------------------+ +| ``READ_FILE`` handled | | deny | +| and not permitted | | | ++------------------------+-------------+-------------------+-------------------+ + +The full list of IOCTL commands and the access rights which affect them is +documented below. Compatibility ============= @@ -457,6 +514,28 @@ Memory usage Kernel memory allocated to create rulesets is accounted and can be restricted by the Documentation/admin-guide/cgroup-v1/memory.rst. +IOCTL support +------------- + +The ``LANDLOCK_ACCESS_FS_IOCTL`` access right restricts the use of +:manpage:`ioctl(2)`, but it only applies to newly opened files. This means +specifically that pre-existing file descriptors like stdin, stdout and stderr +are unaffected. + +Users should be aware that TTY devices have traditionally permitted to control +other processes on the same TTY through the ``TIOCSTI`` and ``TIOCLINUX`` IOCTL +commands. It is therefore recommended to close inherited TTY file descriptors, +or to reopen them from ``/proc/self/fd/*`` without the +``LANDLOCK_ACCESS_FS_IOCTL`` right, if possible. The :manpage:`isatty(3)` +function checks whether a given file descriptor is a TTY. + +Landlock's IOCTL support is coarse-grained at the moment, but may become more +fine-grained in the future. Until then, users are advised to establish the +guarantees that they need through the file hierarchy, by only permitting the +``LANDLOCK_ACCESS_FS_IOCTL`` right on files where it is really harmless. In +cases where you can control the mounts, the ``nodev`` mount option can help to +rule out that device files can be accessed. + Previous limitations ==================== @@ -494,6 +573,16 @@ bind and connect actions to only a set of allowed ports thanks to the new ``LANDLOCK_ACCESS_NET_BIND_TCP`` and ``LANDLOCK_ACCESS_NET_CONNECT_TCP`` access rights. +IOCTL (ABI < 5) +--------------- + +IOCTL operations could not be denied before the fifth Landlock ABI, so +:manpage:`ioctl(2)` is always allowed when using a kernel that only supports an +earlier ABI. + +Starting with the Landlock ABI version 5, it is possible to restrict the use of +:manpage:`ioctl(2)` using the new ``LANDLOCK_ACCESS_FS_IOCTL`` access right. + .. _kernel_support: Kernel support From 6f351af0c85fdbc9fe184bd9744b1b3caf4e9431 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 15:09:41 +0800 Subject: [PATCH 633/707] fs: Use KMEM_CACHE instead of kmem_cache_create commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation") introduces a new macro. Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240131070941.135178-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/backing-file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index a681f38d84d8e1..740185198db347 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { - backing_aio_cachep = kmem_cache_create("backing_aio", - sizeof(struct backing_aio), - 0, SLAB_HWCACHE_ALIGN, NULL); + backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; From b473491b6cf8063c040e6863b74f6d31cabf52f8 Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Wed, 31 Jan 2024 10:54:41 +0800 Subject: [PATCH 634/707] fork: Using clone_flags for legacy clone check In the current implementation of clone(), there is a line that initializes `u64 clone_flags = args->flags` at the top. This means that there is no longer a need to use args->flags for the legacy clone check. Signed-off-by: Wang Jinchao Link: https://lore.kernel.org/r/202401311054+0800-wangjinchao@xfusion.com Signed-off-by: Christian Brauner --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 34704d277b68a5..726a92043531ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2872,8 +2872,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ - if ((args->flags & CLONE_PIDFD) && - (args->flags & CLONE_PARENT_SETTID) && + if ((clone_flags & CLONE_PIDFD) && + (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; From 25639e92cc75059375ea7cb8bcbf1b678feaef9b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:21 +1100 Subject: [PATCH 635/707] nfsd: remove stale comment in nfs4_show_deleg() As we do now support write delegations, this comment is unhelpful and misleading. Reported-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ae9b5a3a585f96..5e640e9945cd6b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2711,7 +2711,6 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) nfs4_show_stateid(s, &st->sc_stateid); seq_printf(s, ": { type: deleg, "); - /* Kinda dead code as long as we only support read delegs: */ seq_printf(s, "access: %s, ", ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); From 2f9ed7d34aa56e75e449b4a9e1079eaf77a501e3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:22 +1100 Subject: [PATCH 636/707] nfsd: hold ->cl_lock for hash_delegation_locked() The protocol for creating a new state in nfsd is to allocate the state leaving it largely uninitialised, add that state to the ->cl_stateids idr so as to reserve a state-id, then complete initialisation of the state and only set ->sc_type to non-zero once the state is fully initialised. If a state is found in the idr with ->sc_type == 0, it is ignored. The ->cl_lock lock is used to avoid races - it is held while checking sc_type during lookup, and held when a non-zero value is stored in ->sc_type. ... except... hash_delegation_locked() finalises the initialisation of a delegation state, but does NOT hold ->cl_lock. So this patch takes ->cl_lock at the appropriate time w.r.t other locks, and so ensures there are no races (which are extremely unlikely in any case). As ->fi_lock is often taken when ->cl_lock is held, we need to take ->cl_lock first of those two. Currently ->cl_lock and state_lock are never both taken at the same time. We need both for this patch so an arbitrary choice is needed concerning which to take first. As state_lock is more global, it might be more contended, so take it first. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5e640e9945cd6b..ae00f9327245e5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1312,6 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; @@ -5558,12 +5559,14 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_unlock; spin_lock(&state_lock); + spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); if (fp->fi_had_conflict) status = -EAGAIN; else status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) From 05d89966ab3eabc4394068ebb3bf5720b01a315f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:23 +1100 Subject: [PATCH 637/707] nfsd: don't call functions with side-effecting inside WARN_ON() Code like: WARN_ON(foo()) looks like an assertion and might not be expected to have any side effects. When testing if a function with side-effects fails a construct like if (foo()) WARN_ON(1); makes the intent more obvious. nfsd has several WARN_ON calls where the test has side effects, so it would be good to change them. These cases don't really need the WARN_ON. They have never failed in 8 years of usage so let's just remove the WARN_ON wrapper. Suggested-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ae00f9327245e5..59982fa5d4fafe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1600,7 +1600,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } @@ -2229,7 +2229,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6170,7 +6170,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8010,7 +8010,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -8303,7 +8303,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); From 176e7ce14e1a2c908dde39b9f38e7e78ae5c594e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:24 +1100 Subject: [PATCH 638/707] nfsd: avoid race after unhash_delegation_locked() NFS4_CLOSED_DELEG_STID and NFS4_REVOKED_DELEG_STID are similar in purpose. REVOKED is used for NFSv4.1 states which have been revoked because the lease has expired. CLOSED is used in other cases. The difference has two practical effects. 1/ REVOKED states are on the ->cl_revoked list 2/ REVOKED states result in nfserr_deleg_revoked from nfsd4_verify_open_stid() and nfsd4_validate_stateid while CLOSED states result in nfserr_bad_stid. Currently a state that is being revoked is first set to "CLOSED" in unhash_delegation_locked(), then possibly to "REVOKED" in revoke_delegation(), at which point it is added to the cl_revoked list. It is possible that a stateid test could see the CLOSED state which really should be REVOKED, and so return the wrong error code. So it is safest to remove this window of inconsistency. With this patch, unhash_delegation_locked() always sets the state correctly, and revoke_delegation() no longer changes the state. Also remove a redundant test on minorversion when NFS4_REVOKED_DELEG_STID is seen - it can only be seen when minorversion is non-zero. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59982fa5d4fafe..3527b9388174bb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1329,7 +1329,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1338,7 +1338,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp) if (!delegation_hashed(dp)) return false; - dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + if (dp->dl_stid.sc_client->cl_minorversion == 0) + type = NFS4_CLOSED_DELEG_STID; + dp->dl_stid.sc_type = type; /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1354,7 +1356,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1368,9 +1370,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (clp->cl_minorversion) { + if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { spin_lock(&clp->cl_lock); - dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); @@ -2229,7 +2230,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -5146,8 +5147,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, goto out; if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { nfs4_put_stid(&deleg->dl_stid); - if (cl->cl_minorversion) - status = nfserr_deleg_revoked; + status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); @@ -6170,7 +6170,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8303,7 +8303,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp); + unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); From 7aa3cc8a10c7cefe58251c7a1febec8f1406b36c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:25 +1100 Subject: [PATCH 639/707] nfsd: split sc_status out of sc_type sc_type identifies the type of a state - open, lock, deleg, layout - and also the status of a state - closed or revoked. This is a bit untidy and could get worse when "admin-revoked" states are added. So clean it up. With this patch, the type is now all that is stored in sc_type. This is zero when the state is first added to ->cl_stateids (causing it to be ignored), and is then set appropriately once it is fully initialised. It is set under ->cl_lock to ensure atomicity w.r.t lookup. It is now never cleared. sc_type is still a bit-set even though at most one bit is set. This allows lookup functions to be given a bitmap of acceptable types. sc_type is now an unsigned short rather than char. There is no value in restricting to just 8 bits. All the constants now start SC_TYPE_ matching the field in which they are stored. Keeping the existing names and ensuring clear separation from non-type flags would have required something like NFS4_STID_TYPE_CLOSED which is cumbersome. The "NFS4" prefix is redundant was they only appear in NFS4 code, so remove that and change STID to SC to match the field. The status is stored in a separate unsigned short named "sc_status". It has two flags: SC_STATUS_CLOSED and SC_STATUS_REVOKED. CLOSED combines NFS4_CLOSED_STID, NFS4_CLOSED_DELEG_STID, and is used for SC_TYPE_LOCK and SC_TYPE_LAYOUT instead of setting the sc_type to zero. These flags are only ever set, never cleared. For deleg stateids they are set under the global state_lock. For open and lock stateids they are set under ->cl_lock. For layout stateids they are set under ->ls_lock nfs4_unhash_stid() has been removed, and we never set sc_type = 0. This was only used for LOCK and LAYOUT stids and they now use SC_STATUS_CLOSED. Also TRACE_DEFINE_NUM() calls for the various STID #define have been removed because these things are not enums, and so that call is incorrect. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4layouts.c | 14 +-- fs/nfsd/nfs4state.c | 207 +++++++++++++++++++++--------------------- fs/nfsd/state.h | 40 +++++--- fs/nfsd/trace.h | 31 +++---- 4 files changed, 151 insertions(+), 141 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 5e8096bc5eaa45..857b822450b4fe 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -236,7 +236,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); - if (parent->sc_type == NFS4_DELEG_STID) + if (parent->sc_type == SC_TYPE_DELEG) ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); @@ -250,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, } spin_lock(&clp->cl_lock); - stp->sc_type = NFS4_LAYOUT_STID; + stp->sc_type = SC_TYPE_LAYOUT; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); @@ -269,13 +269,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; - unsigned char typemask = NFS4_LAYOUT_STID; + unsigned short typemask = SC_TYPE_LAYOUT; __be32 status; if (create) - typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG); - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; @@ -286,7 +286,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, goto out_put_stid; } - if (stid->sc_type != NFS4_LAYOUT_STID) { + if (stid->sc_type != SC_TYPE_LAYOUT) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); @@ -518,7 +518,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); - nfs4_unhash_stid(&ls->ls_stid); + ls->ls_stid.sc_status |= SC_STATUS_CLOSED; lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3527b9388174bb..58096ec81fb979 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1260,11 +1260,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp) nfs4_put_stid(&dp->dl_stid); } -void nfs4_unhash_stid(struct nfs4_stid *s) -{ - s->sc_type = 0; -} - /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to @@ -1317,7 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); - dp->dl_stid.sc_type = NFS4_DELEG_STID; + dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; @@ -1329,7 +1324,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1339,8 +1334,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type) return false; if (dp->dl_stid.sc_client->cl_minorversion == 0) - type = NFS4_CLOSED_DELEG_STID; - dp->dl_stid.sc_type = type; + statusmask = SC_STATUS_CLOSED; + dp->dl_stid.sc_status |= statusmask; + /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1356,7 +1352,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1370,7 +1366,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) { spin_lock(&clp->cl_lock); refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); @@ -1379,8 +1375,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } -/* - * SETCLIENTID state +/* + * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) @@ -1543,7 +1539,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); - nfs4_unhash_stid(&stp->st_stid); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } @@ -1622,6 +1618,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); @@ -2230,7 +2227,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -2462,14 +2459,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t) } static struct nfs4_stid * -find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, + unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { - if (typemask & s->sc_type) + if ((s->sc_status & ~ok_states) == 0 && + (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; @@ -2622,7 +2621,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID) + if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK) return 0; /* XXX: or SEQ_SKIP? */ ols = openlockstateid(st); oo = ols->st_stateowner; @@ -2754,13 +2753,13 @@ static int states_show(struct seq_file *s, void *v) struct nfs4_stid *st = v; switch (st->sc_type) { - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: return nfs4_show_open(s, st); - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: return nfs4_show_lock(s, st); - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); - case NFS4_LAYOUT_STID: + case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? */ @@ -4533,7 +4532,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) continue; if (local->st_stateowner != &oo->oo_owner) continue; - if (local->st_stid.sc_type == NFS4_OPEN_STID) { + if (local->st_stid.sc_type == SC_TYPE_OPEN && + !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -4547,17 +4547,10 @@ nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; - switch (s->sc_type) { - default: - break; - case 0: - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: - ret = nfserr_bad_stateid; - break; - case NFS4_REVOKED_DELEG_STID: + if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; - } + else if (s->sc_status & SC_STATUS_CLOSED) + ret = nfserr_bad_stateid; return ret; } @@ -4643,7 +4636,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); @@ -4870,9 +4863,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || - dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) - return 1; + if (dp->dl_stid.sc_status) + /* CLOSED or REVOKED */ + return 1; switch (task->tk_status) { case 0: @@ -5117,12 +5110,12 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, + stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, - NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); @@ -5145,7 +5138,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; - if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_deleg_revoked; goto out; @@ -5778,7 +5771,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { - stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; @@ -6170,7 +6162,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; - unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6409,22 +6401,20 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; + status = nfsd4_verify_open_stid(s); + if (status) + goto out_unlock; + switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs_ok; break; - case NFS4_REVOKED_DELEG_STID: - status = nfserr_deleg_revoked; - break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); - fallthrough; - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; } out_unlock: @@ -6434,7 +6424,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; @@ -6445,10 +6436,13 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ - if (typemask & NFS4_REVOKED_DELEG_STID) + if (statusmask & SC_STATUS_REVOKED) return_revoked = true; - else if (typemask & NFS4_DELEG_STID) - typemask |= NFS4_REVOKED_DELEG_STID; + if (typemask & SC_TYPE_DELEG) + /* Always allow REVOKED for DELEG so we can + * retturn the appropriate error. + */ + statusmask |= SC_STATUS_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -6461,14 +6455,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - stid = find_stateid_by_type(cstate->clp, stateid, typemask); + stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; - if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); - if (cstate->minorversion) - return nfserr_deleg_revoked; - return nfserr_bad_stateid; + return nfserr_deleg_revoked; } *s = stid; return nfs_ok; @@ -6479,17 +6471,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; - if (!s) + if (!s || s->sc_status) return NULL; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else @@ -6602,7 +6594,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0); if (*stid) status = nfs_ok; else @@ -6659,8 +6652,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, } status = nfsd4_lookup_stateid(cstate, stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, nn); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) @@ -6671,16 +6664,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, goto out; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; - default: - status = nfserr_bad_stateid; - break; } if (status) goto out; @@ -6759,33 +6749,34 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); - if (!s) + if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; spin_lock(&s->sc_lock); switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: + if (s->sc_status & SC_STATUS_REVOKED) { + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + } ret = nfserr_locks_held; break; - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - case NFS4_REVOKED_DELEG_STID: - spin_unlock(&s->sc_lock); - dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; - goto out; /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); @@ -6828,6 +6819,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation + * @statusmask: mask of allowed states: 0 or STID_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * @@ -6837,7 +6829,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { @@ -6848,7 +6841,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); @@ -6870,7 +6864,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, &stp, nn); + SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); @@ -6901,8 +6895,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; status = nfs4_preprocess_seqid_op(cstate, - oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp, nn); + oc->oc_seqid, &oc->oc_req_stateid, + SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -7033,18 +7027,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool need_move_to_close_list; - dprintk("NFSD: nfsd4_close on file %pd\n", + dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp, nn); + &close->cl_stateid, + SC_TYPE_OPEN, SC_STATUS_CLOSED, + &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) - goto out; + goto out; - stp->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since @@ -7095,7 +7091,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7362,7 +7358,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; @@ -7549,9 +7545,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp, nn); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + SC_TYPE_LOCK, 0, &lock_stp, + nn); } if (status) goto out; @@ -7864,8 +7861,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, - &stp, nn); + &locku->lu_stateid, SC_TYPE_LOCK, 0, + &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); @@ -8303,7 +8300,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 87c4372ba36a8d..1d4bf1a7d229c5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -88,17 +88,33 @@ struct nfsd4_callback_ops { */ struct nfs4_stid { refcount_t sc_count; -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 -#define NFS4_DELEG_STID 4 -/* For an open stateid kept around *only* to process close replays: */ -#define NFS4_CLOSED_STID 8 + + /* A new stateid is added to the cl_stateids idr early before it + * is fully initialised. Its sc_type is then zero. After + * initialisation the sc_type it set under cl_lock, and then + * never changes. + */ +#define SC_TYPE_OPEN BIT(0) +#define SC_TYPE_LOCK BIT(1) +#define SC_TYPE_DELEG BIT(2) +#define SC_TYPE_LAYOUT BIT(3) + unsigned short sc_type; + +/* state_lock protects sc_status for delegation stateids. + * ->cl_lock protects sc_status for open and lock stateids. + * ->st_mutex also protect sc_status for open stateids. + * ->ls_lock protects sc_status for layout stateids. + */ +/* + * For an open stateid kept around *only* to process close replays. + * For deleg stateid, kept in idr until last reference is dropped. + */ +#define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ -#define NFS4_REVOKED_DELEG_STID 16 -#define NFS4_CLOSED_DELEG_STID 32 -#define NFS4_LAYOUT_STID 64 +#define SC_STATUS_REVOKED BIT(1) + unsigned short sc_status; + struct list_head sc_cp_list; - unsigned char sc_type; stateid_t sc_stateid; spinlock_t sc_lock; struct nfs4_client *sc_client; @@ -672,15 +688,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **filp, struct nfs4_stid **cstid); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, struct nfsd_net *nn); + stateid_t *stateid, unsigned short typemask, + unsigned short statusmask, + struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); void nfs4_free_copy_state(struct nfsd4_copy *copy); struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid); -void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 9f9e58debc2611..f87dad1fa1d66d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -643,23 +643,17 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); -TRACE_DEFINE_ENUM(NFS4_OPEN_STID); -TRACE_DEFINE_ENUM(NFS4_LOCK_STID); -TRACE_DEFINE_ENUM(NFS4_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); -TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); - #define show_stid_type(x) \ __print_flags(x, "|", \ - { NFS4_OPEN_STID, "OPEN" }, \ - { NFS4_LOCK_STID, "LOCK" }, \ - { NFS4_DELEG_STID, "DELEG" }, \ - { NFS4_CLOSED_STID, "CLOSED" }, \ - { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ - { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ - { NFS4_LAYOUT_STID, "LAYOUT" }) + { SC_TYPE_OPEN, "OPEN" }, \ + { SC_TYPE_LOCK, "LOCK" }, \ + { SC_TYPE_DELEG, "DELEG" }, \ + { SC_TYPE_LAYOUT, "LAYOUT" }) + +#define show_stid_status(x) \ + __print_flags(x, "|", \ + { SC_STATUS_CLOSED, "CLOSED" }, \ + { SC_STATUS_REVOKED, "REVOKED" }) \ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( @@ -668,6 +662,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_ARGS(stid), TP_STRUCT__entry( __field(unsigned long, sc_type) + __field(unsigned long, sc_status) __field(int, sc_count) __field(u32, cl_boot) __field(u32, cl_id) @@ -678,16 +673,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, const stateid_t *stp = &stid->sc_stateid; __entry->sc_type = stid->sc_type; + __entry->sc_status = stid->sc_status; __entry->sc_count = refcount_read(&stid->sc_count); __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; __entry->cl_id = stp->si_opaque.so_clid.cl_id; __entry->si_id = stp->si_opaque.so_id; __entry->si_generation = stp->si_generation; ), - TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s", __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation, - __entry->sc_count, show_stid_type(__entry->sc_type) + __entry->sc_count, show_stid_type(__entry->sc_type), + show_stid_status(__entry->sc_status) ) ); From a020ae81970773ca59d5693f39e51fe8dcf8a389 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:26 +1100 Subject: [PATCH 640/707] nfsd: prepare for supporting admin-revocation of state The NFSv4 protocol allows state to be revoked by the admin and has error codes which allow this to be communicated to the client. This patch - introduces a new state-id status SC_STATUS_ADMIN_REVOKED which can be set on open, lock, or delegation state. - reports NFS4ERR_ADMIN_REVOKED when these are accessed - introduces a per-client counter of these states and returns SEQ4_STATUS_ADMIN_STATE_REVOKED when the counter is not zero. Decrements this when freeing any admin-revoked state. - introduces stub code to find all interesting states for a given superblock so they can be revoked via the 'unlock_filesystem' file in /proc/fs/nfsd/ No actual states are handled yet. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 85 ++++++++++++++++++++++++++++++++++++++++++++- fs/nfsd/nfsctl.c | 1 + fs/nfsd/nfsd.h | 1 + fs/nfsd/state.h | 10 ++++++ fs/nfsd/trace.h | 3 +- 5 files changed, 98 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 58096ec81fb979..a19575571c059d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1210,6 +1210,8 @@ nfs4_put_stid(struct nfs4_stid *s) return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); @@ -1529,6 +1531,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } @@ -1674,6 +1678,68 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_put_stateowner(&oo->oo_owner); } +static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, + struct super_block *sb, + unsigned int sc_types) +{ + unsigned long id, tmp; + struct nfs4_stid *stid; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if ((stid->sc_type & sc_types) && + stid->sc_status == 0 && + stid->sc_file->fi_inode->i_sb == sb) { + refcount_inc(&stid->sc_count); + break; + } + spin_unlock(&clp->cl_lock); + return stid; +} + +/** + * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem + * @net: used to identify instance of nfsd (there is one per net namespace) + * @sb: super_block used to identify target filesystem + * + * All nfs4 states (open, lock, delegation, layout) held by the server instance + * and associated with a file on the given filesystem will be revoked resulting + * in any files being closed and so all references from nfsd to the filesystem + * being released. Thus nfsd will no longer prevent the filesystem from being + * unmounted. + * + * The clients which own the states will subsequently being notified that the + * states have been "admin-revoked". + */ +void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int idhashval; + unsigned int sc_types; + + sc_types = 0; + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + struct nfs4_client *clp; + retry: + list_for_each_entry(clp, head, cl_idhash) { + struct nfs4_stid *stid = find_one_sb_stid(clp, sb, + sc_types); + if (stid) { + spin_unlock(&nn->client_lock); + switch (stid->sc_type) { + } + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + } + } + spin_unlock(&nn->client_lock); +} + static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -2545,6 +2611,8 @@ static int client_info_show(struct seq_file *m, void *v) } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); + seq_printf(m, "admin-revoked states: %d\n", + atomic_read(&clp->cl_admin_revoked)); drop_client(clp); return 0; @@ -4058,6 +4126,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) @@ -4547,7 +4617,9 @@ nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; - if (s->sc_status & SC_STATUS_REVOKED) + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + ret = nfserr_admin_revoked; + else if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; else if (s->sc_status & SC_STATUS_CLOSED) ret = nfserr_bad_stateid; @@ -5138,6 +5210,11 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { + nfs4_put_stid(&deleg->dl_stid); + status = nfserr_admin_revoked; + goto out; + } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); status = nfserr_deleg_revoked; @@ -6444,6 +6521,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, */ statusmask |= SC_STATUS_REVOKED; + statusmask |= SC_STATUS_ADMIN_REVOKED; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; @@ -6462,6 +6541,10 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, nfs4_put_stid(stid); return nfserr_deleg_revoked; } + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfs4_put_stid(stid); + return nfserr_admin_revoked; + } *s = stid; return nfs_ok; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5a5547bd6ecf7e..ecd18bffeebc75 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index be2ea3d6d2a289..8daf22d766c60a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -275,6 +275,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) #define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) #define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED) #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1d4bf1a7d229c5..be02bf1a16bdd9 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -112,6 +112,7 @@ struct nfs4_stid { #define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ #define SC_STATUS_REVOKED BIT(1) +#define SC_STATUS_ADMIN_REVOKED BIT(2) unsigned short sc_status; struct list_head sc_cp_list; @@ -367,6 +368,7 @@ struct nfs4_client { clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_minorversion; + atomic_t cl_admin_revoked; /* count of admin-revoked states */ /* NFSv4.1 client implementation id: */ struct xdr_netobj cl_nii_domain; struct xdr_netobj cl_nii_name; @@ -730,6 +732,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi) } struct nfsd_file *find_any_file(struct nfs4_file *f); +#ifdef CONFIG_NFSD_V4 +void nfsd4_revoke_states(struct net *net, struct super_block *sb); +#else +static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ +} +#endif + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index f87dad1fa1d66d..d8e56268a250ba 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -653,7 +653,8 @@ DEFINE_STATESEQID_EVENT(open_confirm); #define show_stid_status(x) \ __print_flags(x, "|", \ { SC_STATUS_CLOSED, "CLOSED" }, \ - { SC_STATUS_REVOKED, "REVOKED" }) \ + { SC_STATUS_REVOKED, "REVOKED" }, \ + { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" }) DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( From c563cc3d2f04a8a19d3e2963f2256fa3ae70445f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:27 +1100 Subject: [PATCH 641/707] nfsd: allow state with no file to appear in /proc/fs/nfsd/clients/*/states Change the "show" functions to show some content even if a file cannot be found. This is the case for admin-revoked state. This is primarily useful for debugging - to ensure states are being removed eventually. So change several seq_printf() to seq_puts(). Some of these are needed to keep checkpatch happy. Others were done for consistency. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 118 ++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a19575571c059d..a05b6ab81ecffa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2554,9 +2554,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { - seq_printf(m, "\""); + seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); - seq_printf(m, "\""); + seq_puts(m, "\""); } static const char *cb_state2str(int state) @@ -2597,14 +2597,14 @@ static int client_info_show(struct seq_file *m, void *v) seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); - seq_printf(m, "name: "); + seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { - seq_printf(m, "Implementation domain: "); + seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); - seq_printf(m, "\nImplementation name: "); + seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); @@ -2671,7 +2671,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { - seq_printf(s, "owner: "); + seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } @@ -2689,20 +2689,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK) - return 0; /* XXX: or SEQ_SKIP? */ ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: open, "); + seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); @@ -2714,14 +2707,17 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); - nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } spin_unlock(&nf->fi_lock); + nfs4_show_owner(s, oo); + seq_puts(s, " }\n"); return 0; } @@ -2735,30 +2731,29 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: lock, "); + seq_puts(s, ": { type: lock, "); - /* - * Note: a lock stateid isn't really the same thing as a lock, - * it's the locking state held by one owner on a file, and there - * may be multiple (or no) lock ranges associated with it. - * (Same for the matter is true of open stateids.) - */ + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) + */ - nfs4_show_superblock(s, file); - /* XXX: open stateid? */ - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2771,25 +2766,25 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = nf->fi_deleg_file; - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: deleg, "); + seq_puts(s, ": { type: deleg, "); - seq_printf(s, "access: %s, ", - ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + seq_printf(s, "access: %s", + ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = nf->fi_deleg_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2802,16 +2797,19 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: layout, "); + seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + seq_puts(s, " }\n"); return 0; } From 3ea830e6073c78cef16e39ab74a4c6311bf34624 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:28 +1100 Subject: [PATCH 642/707] nfsd: report in /proc/fs/nfsd/clients/*/states when state is admin-revoke Add "admin-revoked" to the status information for any states that have been admin-revoked. This can be useful for confirming correct behaviour. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a05b6ab81ecffa..823239f681538d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2717,6 +2717,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) } spin_unlock(&nf->fi_lock); nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; } @@ -2753,6 +2755,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); } nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; @@ -2784,8 +2788,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); nfs4_show_fname(s, file); } - seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2809,6 +2815,8 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) seq_puts(s, ", "); nfs4_show_fname(s, file); } + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); return 0; From f83397a22ed1db59c89c63437892a49a9a1540c7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:29 +1100 Subject: [PATCH 643/707] nfsd: allow admin-revoked NFSv4.0 state to be freed. For NFSv4.1 and later the client easily discovers if there is any admin-revoked state and will then find and explicitly free it. For NFSv4.0 there is no such mechanism. The client can only find that state is admin-revoked if it tries to use that state, and there is no way for it to explicitly free the state. So the server must hold on to the stateid (at least) for an indefinite amount of time. A RELEASE_LOCKOWNER request might justify forgetting some of these stateids, as would the whole clients lease lapsing, but these are not reliable. This patch takes two approaches. Whenever a client uses an revoked stateid, that stateid is then discarded and will not be recognised again. This might confuse a client which expect to get NFS4ERR_ADMIN_REVOKED consistently once it get it at all, but should mostly work. Hopefully one error will lead to other resources being closed (e.g. process exits), which will result in more stateid being freed when a CLOSE attempt gets NFS4ERR_ADMIN_REVOKED. Also, any admin-revoked stateids that have been that way for more than one lease time are periodically revoke. No actual freeing of state happens in this patch. That will come in future patches which handle the different sorts of revoked state. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 4 ++ fs/nfsd/nfs4state.c | 98 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index afc16ee4da7428..d4be519b5734e3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -209,6 +209,10 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; + + /* last time an admin-revoke happened for NFSv4.0 */ + time64_t nfs40_last_revoke; + }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 823239f681538d..ab7f4e25f2a11d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1733,6 +1733,14 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) } nfs4_put_stid(stid); spin_lock(&nn->client_lock); + if (clp->cl_minorversion == 0) + /* Allow cleanup after a lease period. + * store_release ensures cleanup will + * see any newly revoked states if it + * sees the time updated. + */ + nn->nfs40_last_revoke = + ktime_get_boottime_seconds(); goto retry; } } @@ -4618,6 +4626,40 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } +static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) + __releases(&s->sc_client->cl_lock) +{ + struct nfs4_client *cl = s->sc_client; + + switch (s->sc_type) { + default: + spin_unlock(&cl->cl_lock); + } +} + +static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, + stateid_t *stid) +{ + /* NFSv4.0 has no way for the client to tell the server + * that it can forget an admin-revoked stateid. + * So we keep it around until the first time that the + * client uses it, and drop it the first time + * nfserr_admin_revoked is returned. + * For v4.1 and later we wait until explicitly told + * to free the stateid. + */ + if (cl->cl_minorversion == 0) { + struct nfs4_stid *st; + + spin_lock(&cl->cl_lock); + st = find_stateid_locked(cl, stid); + if (st) + nfsd4_drop_revoked_stid(st); + else + spin_unlock(&cl->cl_lock); + } +} + static __be32 nfsd4_verify_open_stid(struct nfs4_stid *s) { @@ -4640,6 +4682,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(stp->st_stid.sc_client, + &stp->st_stid.sc_stateid); + if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; @@ -5223,6 +5269,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, } if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { nfs4_put_stid(&deleg->dl_stid); + nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); status = nfserr_deleg_revoked; goto out; } @@ -6207,6 +6254,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist) } } +static void nfs40_clean_admin_revoked(struct nfsd_net *nn, + struct laundry_time *lt) +{ + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + if (nn->nfs40_last_revoke == 0 || + nn->nfs40_last_revoke > lt->cutoff) { + spin_unlock(&nn->client_lock); + return; + } + nn->nfs40_last_revoke = 0; + +retry: + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + unsigned long id, tmp; + struct nfs4_stid *stid; + + if (atomic_read(&clp->cl_admin_revoked) == 0) + continue; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + refcount_inc(&stid->sc_count); + spin_unlock(&nn->client_lock); + /* this function drops ->cl_lock */ + nfsd4_drop_revoked_stid(stid); + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + spin_unlock(&clp->cl_lock); + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -6240,6 +6324,8 @@ nfs4_laundromat(struct nfsd_net *nn) nfs4_get_client_reaplist(nn, &reaplist, <); nfs4_process_client_reaplist(&reaplist); + nfs40_clean_admin_revoked(nn, <); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); @@ -6458,6 +6544,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(s->sc_client, + &s->sc_stateid); return ret; } @@ -6502,6 +6591,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) } out_unlock: spin_unlock(&cl->cl_lock); + if (status == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(cl, stateid); return status; } @@ -6548,6 +6639,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfserr_deleg_revoked; } if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd40_drop_revoked_stid(cstate->clp, stateid); nfs4_put_stid(stid); return nfserr_admin_revoked; } @@ -6840,6 +6932,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd4_drop_revoked_stid(s); + ret = nfs_ok; + goto out; + } spin_lock(&s->sc_lock); switch (s->sc_type) { case SC_TYPE_DELEG: @@ -6866,7 +6963,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); out_unlock: From 7ba079c80c2c7eb7b56b6482df38c54f62513ca5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:30 +1100 Subject: [PATCH 644/707] nfsd: allow lock state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any lock states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ab7f4e25f2a11d..4d5b0a798a6a65 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = 0; + sc_types = SC_TYPE_LOCK; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1728,8 +1728,36 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) struct nfs4_stid *stid = find_one_sb_stid(clp, sb, sc_types); if (stid) { + struct nfs4_ol_stateid *stp; + spin_unlock(&nn->client_lock); switch (stid->sc_type) { + case SC_TYPE_LOCK: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + LOCK_STATEID_MUTEX); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + struct nfs4_lockowner *lo = + lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, + (fl_owner_t)lo); + nfsd_file_put(nf); + } + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -4630,8 +4658,18 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) __releases(&s->sc_client->cl_lock) { struct nfs4_client *cl = s->sc_client; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + bool unhashed; switch (s->sc_type) { + case SC_TYPE_LOCK: + stp = openlockstateid(s); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + if (unhashed) + nfs4_put_stid(s); + break; default: spin_unlock(&cl->cl_lock); } From 4988411481a938927a929afe6be4803c0bf695ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:31 +1100 Subject: [PATCH 645/707] nfsd: allow open state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any open states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. Possibly the related lock states should be revoked too, but a subsequent patch will do that for all lock state on the superblock. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4d5b0a798a6a65..daf61f26e609e1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_LOCK; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1732,6 +1732,22 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) spin_unlock(&nn->client_lock); switch (stid->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + OPEN_STATEID_MUTEX); + + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; case SC_TYPE_LOCK: stp = openlockstateid(stid); mutex_lock_nested(&stp->st_mutex, @@ -4663,6 +4679,13 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) bool unhashed; switch (s->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(s); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&cl->cl_lock); + free_ol_stateid_reaplist(&reaplist); + break; case SC_TYPE_LOCK: stp = openlockstateid(s); unhashed = unhash_lock_stateid(stp); From e221f3b9aeef4b2d62da9b20d28131bb2a107620 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:32 +1100 Subject: [PATCH 646/707] nfsd: allow delegation state ids to be revoked and then freed Revoking state through 'unlock_filesystem' now revokes any delegation states found. When the stateids are then freed by the client, the revoked stateids will be cleaned up correctly. As there is already support for revoking delegations, we build on that for admin-revoking. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf61f26e609e1..fe21af8dfc68e5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1335,9 +1335,12 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) if (!delegation_hashed(dp)) return false; - if (dp->dl_stid.sc_client->cl_minorversion == 0) + if (statusmask == SC_STATUS_REVOKED && + dp->dl_stid.sc_client->cl_minorversion == 0) statusmask = SC_STATUS_CLOSED; dp->dl_stid.sc_status |= statusmask; + if (statusmask & SC_STATUS_ADMIN_REVOKED) + atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; @@ -1368,7 +1371,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) { + if (dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { spin_lock(&clp->cl_lock); refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); @@ -1717,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1729,6 +1733,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) sc_types); if (stid) { struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; spin_unlock(&nn->client_lock); switch (stid->sc_type) { @@ -1774,6 +1779,16 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) spin_unlock(&clp->cl_lock); mutex_unlock(&stp->st_mutex); break; + case SC_TYPE_DELEG: + dp = delegstateid(stid); + spin_lock(&state_lock); + if (!unhash_delegation_locked( + dp, SC_STATUS_ADMIN_REVOKED)) + dp = NULL; + spin_unlock(&state_lock); + if (dp) + revoke_delegation(dp); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -4676,6 +4691,7 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) struct nfs4_client *cl = s->sc_client; LIST_HEAD(reaplist); struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; bool unhashed; switch (s->sc_type) { @@ -4693,6 +4709,12 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) if (unhashed) nfs4_put_stid(s); break; + case SC_TYPE_DELEG: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + break; default: spin_unlock(&cl->cl_lock); } From f46d3e4afbb59769808dd4304cfede8fe106f60a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Jan 2024 12:08:33 +1100 Subject: [PATCH 647/707] nfsd: allow layout state to be admin-revoked. When there is layout state on a filesystem that is being "unlocked" that is now revoked, which involves closing the nfsd_file and releasing the vfs lease. To avoid races, ->ls_file can now be accessed either: - under ->fi_lock for the state's sc_file or - under rcu_read_lock() if nfsd_file_get() is used. To support this, ->fence_client and nfsd4_cb_layout_fail() now take a second argument being the nfsd_file. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 4 ++-- fs/nfsd/nfs4layouts.c | 43 ++++++++++++++++++++++++++++++++----------- fs/nfsd/nfs4state.c | 11 +++++++++-- fs/nfsd/pnfs.h | 8 +++++++- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 46fd74d91ea929..3c040c81c77d01 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, } static void -nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 857b822450b4fe..1cfd61db247297 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #endif } +void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ + struct nfsd_file *fl; + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + fl = ls->ls_file; + ls->ls_file = NULL; + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + + if (fl) { + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(fl->nf_file, F_UNLCK, NULL, + (void **)&ls); + nfsd_file_put(fl); + } +} + static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { @@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); - nfsd_file_put(ls->ls_file); + nfsd4_close_layout(ls); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -605,7 +620,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) } static void -nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; @@ -627,7 +642,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; + argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, @@ -657,6 +672,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; + struct nfsd_file *fl; trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { @@ -688,12 +704,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) * Unknown error or non-responding client, we'll need to fence. */ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); - - ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) - ops->fence_client(ls); - else - nfsd4_cb_layout_fail(ls); + rcu_read_lock(); + fl = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (fl) { + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls, fl); + else + nfsd4_cb_layout_fail(ls, fl); + nfsd_file_put(fl); + } return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fe21af8dfc68e5..a66d66b9f76918 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1721,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) unsigned int idhashval; unsigned int sc_types; - sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG; + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; spin_lock(&nn->client_lock); for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { @@ -1734,6 +1734,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) if (stid) { struct nfs4_ol_stateid *stp; struct nfs4_delegation *dp; + struct nfs4_layout_stateid *ls; spin_unlock(&nn->client_lock); switch (stid->sc_type) { @@ -1789,6 +1790,10 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) if (dp) revoke_delegation(dp); break; + case SC_TYPE_LAYOUT: + ls = layoutstateid(stid); + nfsd4_close_layout(ls); + break; } nfs4_put_stid(stid); spin_lock(&nn->client_lock); @@ -2868,7 +2873,6 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); - file = ls->ls_file; seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2876,12 +2880,15 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) /* XXX: What else would be useful? */ + spin_lock(&ls->ls_stid.sc_file->fi_lock); + file = ls->ls_file; if (file) { seq_puts(s, ", "); nfs4_show_superblock(s, file); seq_puts(s, ", "); nfs4_show_fname(s, file); } + spin_unlock(&ls->ls_stid.sc_file->fi_lock); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); seq_puts(s, " }\n"); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index de1e0dfed06a23..925817f669176c 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -37,7 +37,8 @@ struct nfsd4_layout_ops { __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls); + void (*fence_client)(struct nfs4_layout_stateid *ls, + struct nfsd_file *file); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; @@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp); void nfsd4_return_all_client_layouts(struct nfs4_client *); void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp); +void nfsd4_close_layout(struct nfs4_layout_stateid *ls); int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else struct nfs4_client; struct nfs4_file; +struct nfs4_layout_stateid; static inline void nfsd4_setup_layout_type(struct svc_export *exp) { @@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { } +static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ +} static inline void nfsd4_exit_pnfs(void) { } From 23fed3b5e4abb0d0ce72191f7cc6dcec09fcfa76 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Jan 2024 11:17:40 +1100 Subject: [PATCH 648/707] nfsd: don't call locks_release_private() twice concurrently It is possible for free_blocked_lock() to be called twice concurrently, once from nfsd4_lock() and once from nfsd4_release_lockowner() calling remove_blocked_locks(). This is why a kref was added. It is perfectly safe for locks_delete_block() and kref_put() to be called in parallel as they use locking or atomicity respectively as protection. However locks_release_private() has no locking. It is safe for it to be called twice sequentially, but not concurrently. This patch moves that call from free_blocked_lock() where it could race with itself, to free_nbl() where it cannot. This will slightly delay the freeing of private info or release of the owner - but not by much. It is arguably more natural for this freeing to happen in free_nbl() where the structure itself is freed. This bug was found by code inspection - it has not been seen in practice. Fixes: 47446d74f170 ("nfsd4: add refcount for nfsd4_blocked_lock") Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a66d66b9f76918..12534e12dbb363 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -318,6 +318,7 @@ free_nbl(struct kref *kref) struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -325,7 +326,6 @@ static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); - locks_release_private(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } From 74967bfbffe468e2ddc4e5bc8f3bfb0c2c8a2888 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 14:22:27 +0800 Subject: [PATCH 649/707] nfsd: Simplify the allocation of slab caches in nfsd4_init_pnfs commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation") introduces a new macro. Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Signed-off-by: Chuck Lever --- fs/nfsd/nfs4layouts.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 1cfd61db247297..b1e585c1d9a3aa 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -777,13 +777,11 @@ nfsd4_init_pnfs(void) for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); - nfs4_layout_cache = kmem_cache_create("nfs4_layout", - sizeof(struct nfs4_layout), 0, 0, NULL); + nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0); if (!nfs4_layout_cache) return -ENOMEM; - nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", - sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; From 6c1c91f97746154611770e14f9be4d65a52f77c6 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 14:56:53 +0800 Subject: [PATCH 650/707] nfsd: Simplify the allocation of slab caches in nfsd_file_cache_init Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8d9f7b07e35b39..f3a642fd0ecaa8 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -722,15 +722,13 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_file_slab = kmem_cache_create("nfsd_file", - sizeof(struct nfsd_file), 0, 0, NULL); + nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); if (!nfsd_file_slab) { pr_err("nfsd: unable to create nfsd_file_slab\n"); goto out_err; } - nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", - sizeof(struct nfsd_file_mark), 0, 0, NULL); + nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0); if (!nfsd_file_mark_slab) { pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); goto out_err; From 2c30c237b18027aa9d29c5d99f988541791f80ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jan 2024 14:26:02 +0100 Subject: [PATCH 651/707] pidfd: implement PIDFD_THREAD flag for pidfd_open() With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. Co-developed-by: Tycho Andersen Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- fs/exec.c | 6 +++++- include/linux/pid.h | 3 ++- include/uapi/linux/pidfd.h | 3 ++- kernel/exit.c | 7 +++++++ kernel/fork.c | 38 +++++++++++++++++++++++++++++++------- kernel/pid.c | 14 +++----------- kernel/signal.c | 6 ++++-- 7 files changed, 54 insertions(+), 23 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c2e..b68f61bbcaa82e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - + /* + * leader and tsk exhanged their pids, the old pid dies, + * wake up the PIDFD_THREAD waiters. + */ + do_notify_pidfd(leader); /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees diff --git a/include/linux/pid.h b/include/linux/pid.h index e6a041cb8bacc7..8124d57752b938 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops; struct file; -extern struct pid *pidfd_pid(const struct file *file); +struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); +void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) { diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 5406fbc1307489..2e6461459877ba 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -7,6 +7,7 @@ #include /* Flags for pidfd_open(). */ -#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_THREAD O_EXCL #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 3988a02efaef06..c038d10dfb3868 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; + /* + * sub-thread or delay_group_leader(), wake up the + * PIDFD_THREAD waiters. + */ + if (!thread_group_empty(tsk)) + do_notify_pidfd(tsk); + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/fork.c b/kernel/fork.c index 726a92043531ff..1a9b910559169c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -101,6 +101,7 @@ #include #include #include +#include #include #include @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) seq_put_decimal_ll(m, "Pid:\t", nr); + /* TODO: report PIDFD_THREAD */ + #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) } #endif +static bool pidfd_task_exited(struct pid *pid, bool thread) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && (thread || thread_group_empty(task))); + rcu_read_unlock(); + + return exited; +} + /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = file->private_data; + bool thread = file->f_flags & PIDFD_THREAD; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - /* - * Inform pollers only when the whole thread group exits. - * If the thread group leader exits before all other threads in the - * group, then poll(2) should block, similar to the wait(2) family. + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. */ - if (thread_group_exited(pid)) + if (pidfd_task_exited(pid, thread)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re return PTR_ERR(pidfd_file); } get_pid(pid); /* held by pidfd_file now */ + /* + * anon_inode_getfile() ignores everything outside of the + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + */ + pidfd_file->f_flags |= (flags & PIDFD_THREAD); *ret = pidfd_file; return pidfd; } @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper verifies that @pid is used as a thread group leader. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + bool thread = flags & PIDFD_THREAD; + + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); diff --git a/kernel/pid.c b/kernel/pid.c index c7a3e359f8f590..e1114446682802 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * - * Currently, the process identified by @pidfd is always a thread-group leader. - * This restriction currently exists for all aspects of pidfds including pidfd - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling - * (only supports thread group leaders). - * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags) * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags & ~PIDFD_NONBLOCK) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) diff --git a/kernel/signal.c b/kernel/signal.c index 9561a3962ca687..9b40109f0c5648 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) return ret; } -static void do_notify_pidfd(struct task_struct *task) +void do_notify_pidfd(struct task_struct *task) { struct pid *pid; @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the pidfd waiters. + * tsk is a group leader and has no threads, wake up the + * non-PIDFD_THREAD waiters. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, prepare_kill_siginfo(sig, &kinfo); } + /* TODO: respect PIDFD_THREAD */ ret = kill_pid_info(sig, &kinfo, pid); err: From 2747e0ee57c2742dacf27920e815d87c6ab62643 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:07 -0700 Subject: [PATCH 652/707] bpf: btf: Add BTF_KFUNCS_START/END macro pair This macro pair is functionally equivalent to BTF_SET8_START/END, except with BTF_SET8_KFUNCS flag set in the btf_id_set8 flags field. The next commit will codemod all kfunc set8s to this new variant such that all kfuncs are tagged as such in .BTF_ids section. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/d536c57c7c2af428686853cc7396b7a44faa53b7.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index dca09b7f21dc28..0fe4f1cd1918f9 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -8,6 +8,9 @@ struct btf_id_set { u32 ids[]; }; +/* This flag implies BTF_SET8 holds kfunc(s) */ +#define BTF_SET8_KFUNCS (1 << 0) + struct btf_id_set8 { u32 cnt; u32 flags; @@ -204,6 +207,12 @@ asm( \ ".popsection; \n"); \ extern struct btf_id_set8 name; +#define BTF_KFUNCS_START(name) \ +__BTF_SET8_START(name, local, BTF_SET8_KFUNCS) + +#define BTF_KFUNCS_END(name) \ +BTF_SET8_END(name) + #else #define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; @@ -218,6 +227,8 @@ extern struct btf_id_set8 name; #define BTF_SET_END(name) #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; #define BTF_SET8_END(name) +#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; +#define BTF_KFUNCS_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ From 6e7769e6419f7836227c74da1f569961e5de0c0e Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:08 -0700 Subject: [PATCH 653/707] bpf: treewide: Annotate BPF kfuncs in BTF This commit marks kfuncs as such inside the .BTF_ids section. The upshot of these annotations is that we'll be able to automatically generate kfunc prototypes for downstream users. The process is as follows: 1. In source, use BTF_KFUNCS_START/END macro pair to mark kfuncs 2. During build, pahole injects into BTF a "bpf_kfunc" BTF_DECL_TAG for each function inside BTF_KFUNCS sets 3. At runtime, vmlinux or module BTF is made available in sysfs 4. At runtime, bpftool (or similar) can look at provided BTF and generate appropriate prototypes for functions with "bpf_kfunc" tag To ensure future kfunc are similarly tagged, we now also return error inside kfunc registration for untagged kfuncs. For vmlinux kfuncs, we also WARN(), as initcall machinery does not handle errors. Signed-off-by: Daniel Xu Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/e55150ceecbf0a5d961e608941165c0bee7bc943.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 8 ++++---- drivers/hid/bpf/hid_bpf_dispatch.c | 8 ++++---- fs/verity/measure.c | 4 ++-- kernel/bpf/btf.c | 8 ++++++++ kernel/bpf/cpumask.c | 4 ++-- kernel/bpf/helpers.c | 8 ++++---- kernel/bpf/map_iter.c | 4 ++-- kernel/cgroup/rstat.c | 4 ++-- kernel/trace/bpf_trace.c | 8 ++++---- net/bpf/test_run.c | 8 ++++---- net/core/filter.c | 20 +++++++++---------- net/core/xdp.c | 4 ++-- net/ipv4/bpf_tcp_ca.c | 4 ++-- net/ipv4/fou_bpf.c | 4 ++-- net/ipv4/tcp_bbr.c | 4 ++-- net/ipv4/tcp_cubic.c | 4 ++-- net/ipv4/tcp_dctcp.c | 4 ++-- net/netfilter/nf_conntrack_bpf.c | 4 ++-- net/netfilter/nf_nat_bpf.c | 4 ++-- net/xfrm/xfrm_interface_bpf.c | 4 ++-- net/xfrm/xfrm_state_bpf.c | 4 ++-- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 8 ++++---- 22 files changed, 70 insertions(+), 62 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 7985c6615f3c2f..a8f5782bd83318 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -177,10 +177,10 @@ In addition to kfuncs' arguments, verifier may need more information about the type of kfunc(s) being registered with the BPF subsystem. To do so, we define flags on a set of kfuncs as follows:: - BTF_SET8_START(bpf_task_set) + BTF_KFUNCS_START(bpf_task_set) BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) - BTF_SET8_END(bpf_task_set) + BTF_KFUNCS_END(bpf_task_set) This set encodes the BTF ID of each kfunc listed above, and encodes the flags along with it. Ofcourse, it is also allowed to specify no flags. @@ -347,10 +347,10 @@ Once the kfunc is prepared for use, the final step to making it visible is registering it with the BPF subsystem. Registration is done per BPF program type. An example is shown below:: - BTF_SET8_START(bpf_task_set) + BTF_KFUNCS_START(bpf_task_set) BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) - BTF_SET8_END(bpf_task_set) + BTF_KFUNCS_END(bpf_task_set) static const struct btf_kfunc_id_set bpf_task_kfunc_set = { .owner = THIS_MODULE, diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index d9ef45fcaeab13..02c441aaa21751 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -172,9 +172,9 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr * The following set contains all functions we agree BPF programs * can use. */ -BTF_SET8_START(hid_bpf_kfunc_ids) +BTF_KFUNCS_START(hid_bpf_kfunc_ids) BTF_ID_FLAGS(func, hid_bpf_get_data, KF_RET_NULL) -BTF_SET8_END(hid_bpf_kfunc_ids) +BTF_KFUNCS_END(hid_bpf_kfunc_ids) static const struct btf_kfunc_id_set hid_bpf_kfunc_set = { .owner = THIS_MODULE, @@ -440,12 +440,12 @@ static const struct btf_kfunc_id_set hid_bpf_fmodret_set = { }; /* for syscall HID-BPF */ -BTF_SET8_START(hid_bpf_syscall_kfunc_ids) +BTF_KFUNCS_START(hid_bpf_syscall_kfunc_ids) BTF_ID_FLAGS(func, hid_bpf_attach_prog) BTF_ID_FLAGS(func, hid_bpf_allocate_context, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, hid_bpf_release_context, KF_RELEASE) BTF_ID_FLAGS(func, hid_bpf_hw_request) -BTF_SET8_END(hid_bpf_syscall_kfunc_ids) +BTF_KFUNCS_END(hid_bpf_syscall_kfunc_ids) static const struct btf_kfunc_id_set hid_bpf_syscall_kfunc_set = { .owner = THIS_MODULE, diff --git a/fs/verity/measure.c b/fs/verity/measure.c index bf7a5f4cccaf04..3969d54158d128 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker __bpf_kfunc_end_defs(); -BTF_SET8_START(fsverity_set_ids) +BTF_KFUNCS_START(fsverity_set_ids) BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) -BTF_SET8_END(fsverity_set_ids) +BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c8c6e6cf18e7f4..ef380e5469521b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8124,6 +8124,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { enum btf_kfunc_hook hook; + /* All kfuncs need to be tagged as such in BTF. + * WARN() for initcall registrations that do not check errors. + */ + if (!(kset->set->flags & BTF_SET8_KFUNCS)) { + WARN_ON(!kset->owner); + return -EINVAL; + } + hook = bpf_prog_type_to_kfunc_hook(prog_type); return __register_btf_kfunc_id_set(hook, kset); } diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 2e73533a3811cd..dad0fb1c8e876f 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -424,7 +424,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) __bpf_kfunc_end_defs(); -BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) @@ -450,7 +450,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) -BTF_SET8_END(cpumask_kfunc_btf_ids) +BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bcb951a2ecf4b9..4db1c658254c17 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2544,7 +2544,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) __bpf_kfunc_end_defs(); -BTF_SET8_START(generic_btf_ids) +BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif @@ -2573,7 +2573,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) -BTF_SET8_END(generic_btf_ids) +BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, @@ -2589,7 +2589,7 @@ BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif -BTF_SET8_START(common_btf_ids) +BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) @@ -2618,7 +2618,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) -BTF_SET8_END(common_btf_ids) +BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6abd7c5df4b39e..9575314f40a692 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_map_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a8350d2d63e6b1..07e2284bb49971 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -562,10 +562,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, cgroup_rstat_updated) BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) -BTF_SET8_END(bpf_rstat_kfunc_ids) +BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64fdaf79d11365..241ddf5e38953e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1412,14 +1412,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, __bpf_kfunc_end_defs(); -BTF_SET8_START(key_sig_kfunc_set) +BTF_KFUNCS_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif -BTF_SET8_END(key_sig_kfunc_set) +BTF_KFUNCS_END(key_sig_kfunc_set) static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { .owner = THIS_MODULE, @@ -1475,9 +1475,9 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, __bpf_kfunc_end_defs(); -BTF_SET8_START(fs_kfunc_set_ids) +BTF_KFUNCS_START(fs_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_SET8_END(fs_kfunc_set_ids) +BTF_KFUNCS_END(fs_kfunc_set_ids) static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index dfd91937401783..5535f9adc6589d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -617,21 +617,21 @@ CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_test_modify_return_ids) +BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) -BTF_SET8_END(bpf_test_modify_return_ids) +BTF_KFUNCS_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; -BTF_SET8_START(test_sk_check_kfunc_ids) +BTF_KFUNCS_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) -BTF_SET8_END(test_sk_check_kfunc_ids) +BTF_KFUNCS_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) diff --git a/net/core/filter.c b/net/core/filter.c index 358870408a51e6..524adf1fa6d019 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11982,21 +11982,21 @@ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, return 0; } -BTF_SET8_START(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) -BTF_SET8_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) -BTF_SET8_START(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) -BTF_SET8_END(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) -BTF_SET8_START(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) -BTF_SET8_END(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) -BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, @@ -12075,9 +12075,9 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 4869c1c2d8f3d9..034fb80f3fbe9b 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -771,11 +771,11 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(xdp_metadata_kfunc_ids) +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC -BTF_SET8_END(xdp_metadata_kfunc_ids) +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 834edc18463ac1..7f518ea5f4ac7d 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -201,13 +201,13 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } -BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids) BTF_ID_FLAGS(func, tcp_reno_ssthresh) BTF_ID_FLAGS(func, tcp_reno_cong_avoid) BTF_ID_FLAGS(func, tcp_reno_undo_cwnd) BTF_ID_FLAGS(func, tcp_slow_start) BTF_ID_FLAGS(func, tcp_cong_avoid_ai) -BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids) static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c index 4da03bf45c9b75..06e5572f296f1e 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -100,10 +100,10 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(fou_kfunc_set) +BTF_KFUNCS_START(fou_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_set_fou_encap) BTF_ID_FLAGS(func, bpf_skb_get_fou_encap) -BTF_SET8_END(fou_kfunc_set) +BTF_KFUNCS_END(fou_kfunc_set) static const struct btf_kfunc_id_set fou_bpf_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 22358032dd484b..05dc2d05bc7cbb 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1155,7 +1155,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET8_START(tcp_bbr_check_kfunc_ids) +BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) @@ -1168,7 +1168,7 @@ BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) #endif #endif -BTF_SET8_END(tcp_bbr_check_kfunc_ids) +BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0fd78ecb67e756..44869ea089e346 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; -BTF_SET8_START(tcp_cubic_check_kfunc_ids) +BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) @@ -496,7 +496,7 @@ BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) #endif #endif -BTF_SET8_END(tcp_cubic_check_kfunc_ids) +BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index bb23bb5b387a0c..e33fbe4933e42f 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -260,7 +260,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET8_START(tcp_dctcp_check_kfunc_ids) +BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) @@ -271,7 +271,7 @@ BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) #endif #endif -BTF_SET8_END(tcp_dctcp_check_kfunc_ids) +BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 475358ec821296..d2492d050fe601 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -467,7 +467,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_ct_kfunc_set) +BTF_KFUNCS_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) @@ -478,7 +478,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_ct_kfunc_set) +BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 6e3b2f58855fc0..481be15609b16a 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -54,9 +54,9 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_nat_kfunc_set) +BTF_KFUNCS_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_nat_kfunc_set) +BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c index 7d5e920141e9b7..5ea15037ebd104 100644 --- a/net/xfrm/xfrm_interface_bpf.c +++ b/net/xfrm/xfrm_interface_bpf.c @@ -93,10 +93,10 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_KFUNCS_START(xfrm_ifc_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) -BTF_SET8_END(xfrm_ifc_kfunc_set) +BTF_KFUNCS_END(xfrm_ifc_kfunc_set) static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c index 9e20d4a377f7eb..2248eda741f8e0 100644 --- a/net/xfrm/xfrm_state_bpf.c +++ b/net/xfrm/xfrm_state_bpf.c @@ -117,10 +117,10 @@ __bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_state_kfunc_set) +BTF_KFUNCS_START(xfrm_state_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE) -BTF_SET8_END(xfrm_state_kfunc_set) +BTF_KFUNCS_END(xfrm_state_kfunc_set) static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = { .owner = THIS_MODULE, diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 6f163a0f1c94cc..4754c662b39ff8 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -343,12 +343,12 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { .write = bpf_testmod_test_write, }; -BTF_SET8_START(bpf_testmod_common_kfunc_ids) +BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_kfunc_common_test) -BTF_SET8_END(bpf_testmod_common_kfunc_ids) +BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, @@ -494,7 +494,7 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } -BTF_SET8_START(bpf_testmod_check_kfunc_ids) +BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) BTF_ID_FLAGS(func, bpf_kfunc_call_test2) @@ -520,7 +520,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) -BTF_SET8_END(bpf_testmod_check_kfunc_ids) +BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) { From 320d6fa71508ccfe5f9cd75561ab3d3919d9628c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 31 Jan 2024 10:10:18 +0300 Subject: [PATCH 654/707] smb: client: Fix a NULL vs IS_ERR() check in wsl_set_xattrs() This was intended to be an IS_ERR() check. The ea_create_context() function doesn't return NULL. Fixes: 1eab17fe485c ("smb: client: add support for WSL reparse points") Reviewed-by: Paulo Alcantara Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/smb/client/reparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 23e0cb62552ec7..84ff1683a40bcc 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -226,7 +226,7 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode, } cc = ea_create_context(dlen, &cc_len); - if (!cc) + if (IS_ERR(cc)) return PTR_ERR(cc); ea = &cc->ea; From cf7f97cd68843d987cc7daca253f0bfbbbc535c1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 28 Jan 2024 01:12:01 -0300 Subject: [PATCH 655/707] smb: client: introduce SMB2_OP_QUERY_WSL_EA Add a new command to smb2_compound_op() for querying WSL extended attributes from reparse points. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 5 ++ fs/smb/client/reparse.c | 10 +-- fs/smb/client/smb2glob.h | 3 +- fs/smb/client/smb2inode.c | 170 +++++++++++++++++++++++++++++++++----- fs/smb/client/smb2pdu.h | 25 ++++++ fs/smb/client/trace.h | 2 + 6 files changed, 190 insertions(+), 25 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 90bc90ed97ef66..decf80131bbeb4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -214,6 +214,10 @@ struct cifs_open_info_data { struct reparse_posix_data *posix; }; } reparse; + struct { + __u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE]; + unsigned int eas_len; + } wsl; char *symlink_target; struct cifs_sid posix_owner; struct cifs_sid posix_group; @@ -2292,6 +2296,7 @@ struct smb2_compound_vars { struct kvec close_iov; struct smb2_file_rename_info rename_info; struct smb2_file_link_info link_info; + struct kvec ea_iov; }; #endif /* _CIFS_GLOB_H */ diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 84ff1683a40bcc..5f6674d8e098bd 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -201,15 +201,15 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode, __le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev)); __le64 mode = cpu_to_le64(_mode); struct wsl_xattr xattrs[] = { - { .name = "$LXUID", .value = uid, .size = 4, }, - { .name = "$LXGID", .value = gid, .size = 4, }, - { .name = "$LXMOD", .value = mode, .size = 4, }, - { .name = "$LXDEV", .value = dev, .size = 8, }, + { .name = SMB2_WSL_XATTR_UID, .value = uid, .size = SMB2_WSL_XATTR_UID_SIZE, }, + { .name = SMB2_WSL_XATTR_GID, .value = gid, .size = SMB2_WSL_XATTR_GID_SIZE, }, + { .name = SMB2_WSL_XATTR_MODE, .value = mode, .size = SMB2_WSL_XATTR_MODE_SIZE, }, + { .name = SMB2_WSL_XATTR_DEV, .value = dev, .size = SMB2_WSL_XATTR_DEV_SIZE, }, }; size_t cc_len; u32 dlen = 0, next = 0; int i, num_xattrs; - u8 name_size = strlen(xattrs[0].name) + 1; + u8 name_size = SMB2_WSL_XATTR_NAME_LEN + 1; memset(iov, 0, sizeof(*iov)); diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index a0c156996fc51e..2466e61551369c 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -36,7 +36,8 @@ enum smb2_compound_ops { SMB2_OP_RMDIR, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, - SMB2_OP_GET_REPARSE + SMB2_OP_GET_REPARSE, + SMB2_OP_QUERY_WSL_EA, }; /* Used when constructing chained read requests. */ diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index e9955b964ea470..63485078a6df6c 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -85,6 +85,82 @@ static int parse_posix_sids(struct cifs_open_info_data *data, return 0; } +struct wsl_query_ea { + __le32 next; + __u8 name_len; + __u8 name[SMB2_WSL_XATTR_NAME_LEN + 1]; +} __packed; + +#define NEXT_OFF cpu_to_le32(sizeof(struct wsl_query_ea)) + +static const struct wsl_query_ea wsl_query_eas[] = { + { .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_UID, }, + { .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_GID, }, + { .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_MODE, }, + { .next = 0, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_DEV, }, +}; + +static int check_wsl_eas(struct kvec *rsp_iov) +{ + struct smb2_file_full_ea_info *ea; + struct smb2_query_info_rsp *rsp = rsp_iov->iov_base; + unsigned long addr; + u32 outlen, next; + u8 nlen; + u16 vlen; + u8 *end; + + outlen = le32_to_cpu(rsp->OutputBufferLength); + if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE || + outlen > SMB2_WSL_MAX_QUERY_EA_RESP_SIZE) + return -EINVAL; + + ea = (void *)((u8 *)rsp_iov->iov_base + + le16_to_cpu(rsp->OutputBufferOffset)); + end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len; + for (;;) { + if ((u8 *)ea > end - sizeof(*ea)) + return -EINVAL; + + nlen = ea->ea_name_length; + vlen = le16_to_cpu(ea->ea_value_length); + if (nlen != SMB2_WSL_XATTR_NAME_LEN || + (u8 *)ea + nlen + 1 + vlen > end) + return -EINVAL; + + switch (vlen) { + case 4: + if (strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) && + strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) && + strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen)) + return -EINVAL; + break; + case 8: + if (strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen)) + return -EINVAL; + break; + case 0: + if (!strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) || + !strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) || + !strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen) || + !strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen)) + break; + fallthrough; + default: + return -EINVAL; + } + + next = le32_to_cpu(ea->next_entry_offset); + if (!next) + break; + if (!IS_ALIGNED(next, 4) || + check_add_overflow((unsigned long)ea, next, &addr)) + return -EINVAL; + ea = (void *)addr; + } + return 0; +} + /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. @@ -118,7 +194,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; void *data[2]; - int len; + unsigned int len; int retries = 0, cur_sleep = 1; replay_again: @@ -455,6 +531,39 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_get_reparse_compound_enter(xid, ses->Suid, tcon->tid, full_path); break; + case SMB2_OP_QUERY_WSL_EA: + rqst[num_rqst].rq_iov = &vars->ea_iov; + rqst[num_rqst].rq_nvec = 1; + + if (cfile) { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, 0, + SMB2_WSL_MAX_QUERY_EA_RESP_SIZE, + sizeof(wsl_query_eas), + (void *)wsl_query_eas); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, 0, + SMB2_WSL_MAX_QUERY_EA_RESP_SIZE, + sizeof(wsl_query_eas), + (void *)wsl_query_eas); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { + goto finished; + } + num_rqst++; + break; default: cifs_dbg(VFS, "Invalid command\n"); rc = -EINVAL; @@ -637,11 +746,32 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, memset(iov, 0, sizeof(*iov)); resp_buftype[i + 1] = CIFS_NO_BUFFER; } else { - trace_smb3_set_reparse_compound_err(xid, ses->Suid, + trace_smb3_set_reparse_compound_err(xid, ses->Suid, tcon->tid, rc); } SMB2_ioctl_free(&rqst[num_rqst++]); break; + case SMB2_OP_QUERY_WSL_EA: + if (!rc) { + idata = in_iov[i].iov_base; + qi_rsp = rsp_iov[i + 1].iov_base; + data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset); + size[0] = le32_to_cpu(qi_rsp->OutputBufferLength); + rc = check_wsl_eas(&rsp_iov[i + 1]); + if (!rc) { + memcpy(idata->wsl.eas, data[0], size[0]); + idata->wsl.eas_len = size[0]; + } + } + if (!rc) { + trace_smb3_query_wsl_ea_compound_done(xid, ses->Suid, + tcon->tid); + } else { + trace_smb3_query_wsl_ea_compound_err(xid, ses->Suid, + tcon->tid, rc); + } + SMB2_query_info_free(&rqst[num_rqst++]); + break; } } SMB2_close_free(&rqst[num_rqst]); @@ -709,11 +839,11 @@ int smb2_query_path_info(const unsigned int xid, struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; struct smb2_hdr *hdr; - struct kvec in_iov[2], out_iov[3] = {}; + struct kvec in_iov[3], out_iov[3] = {}; int out_buftype[3] = {}; - int cmds[2]; + int cmds[3]; bool islink; - int i, num_cmds; + int i, num_cmds = 0; int rc, rc2; data->adjust_tz = false; @@ -746,21 +876,22 @@ int smb2_query_path_info(const unsigned int xid, close_cached_dir(cfid); return rc; } - cmds[0] = SMB2_OP_QUERY_INFO; + cmds[num_cmds++] = SMB2_OP_QUERY_INFO; } else { - cmds[0] = SMB2_OP_POSIX_QUERY_INFO; + cmds[num_cmds++] = SMB2_OP_POSIX_QUERY_INFO; } in_iov[0].iov_base = data; in_iov[0].iov_len = sizeof(*data); in_iov[1] = in_iov[0]; + in_iov[2] = in_iov[0]; cifs_get_readable_path(tcon, full_path, &cfile); oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - &oparms, in_iov, cmds, 1, cfile, - out_iov, out_buftype); + &oparms, in_iov, cmds, num_cmds, + cfile, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -780,17 +911,18 @@ int smb2_query_path_info(const unsigned int xid, if (rc || !data->reparse_point) goto out; - if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { - /* symlink already parsed in create response */ - num_cmds = 1; - } else { - cmds[1] = SMB2_OP_GET_REPARSE; - num_cmds = 2; - } + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + /* + * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create + * response. + */ + if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) + cmds[num_cmds++] = SMB2_OP_GET_REPARSE; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options | OPEN_REPARSE_POINT, - ACL_NO_MODE); + FILE_READ_ATTRIBUTES | FILE_READ_EA, + FILE_OPEN, create_options | + OPEN_REPARSE_POINT, ACL_NO_MODE); cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, in_iov, cmds, num_cmds, diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index ea63d33e455322..c72a3b2886b7ff 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -420,4 +420,29 @@ struct smb2_create_ea_ctx { struct smb2_file_full_ea_info ea; } __packed; +#define SMB2_WSL_XATTR_UID "$LXUID" +#define SMB2_WSL_XATTR_GID "$LXGID" +#define SMB2_WSL_XATTR_MODE "$LXMOD" +#define SMB2_WSL_XATTR_DEV "$LXDEV" +#define SMB2_WSL_XATTR_NAME_LEN 6 +#define SMB2_WSL_NUM_XATTRS 4 + +#define SMB2_WSL_XATTR_UID_SIZE 4 +#define SMB2_WSL_XATTR_GID_SIZE 4 +#define SMB2_WSL_XATTR_MODE_SIZE 4 +#define SMB2_WSL_XATTR_DEV_SIZE 8 + +#define SMB2_WSL_MIN_QUERY_EA_RESP_SIZE \ + (ALIGN((SMB2_WSL_NUM_XATTRS - 1) * \ + (SMB2_WSL_XATTR_NAME_LEN + 1 + \ + sizeof(struct smb2_file_full_ea_info)), 4) + \ + SMB2_WSL_XATTR_NAME_LEN + 1 + sizeof(struct smb2_file_full_ea_info)) + +#define SMB2_WSL_MAX_QUERY_EA_RESP_SIZE \ + (ALIGN(SMB2_WSL_MIN_QUERY_EA_RESP_SIZE + \ + SMB2_WSL_XATTR_UID_SIZE + \ + SMB2_WSL_XATTR_GID_SIZE + \ + SMB2_WSL_XATTR_MODE_SIZE + \ + SMB2_WSL_XATTR_DEV_SIZE, 4)) + #endif /* _SMB2PDU_H */ diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 522fa387fcfd72..ce90ae0d77f849 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -411,6 +411,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); @@ -456,6 +457,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); From 2417900a8dcec30415ba9318f02d4346a158c34b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 28 Jan 2024 21:52:03 -0300 Subject: [PATCH 656/707] smb: client: parse uid, gid, mode and dev from WSL reparse points Parse the extended attributes from WSL reparse points to correctly report uid, gid mode and dev from ther instantiated inodes. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/inode.c | 5 ++- fs/smb/client/readdir.c | 2 ++ fs/smb/client/reparse.c | 78 +++++++++++++++++++++++++++++++++-------- fs/smb/client/reparse.h | 29 +++++++++++++++ 4 files changed, 97 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 56d77ff8249d50..24489e1e238ad1 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -758,6 +758,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + fattr->cf_uid = cifs_sb->ctx->linux_uid; + fattr->cf_gid = cifs_sb->ctx->linux_gid; fattr->cf_mode = cifs_sb->ctx->file_mode; if (cifs_open_data_reparse(data) && @@ -800,9 +802,6 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; } - - fattr->cf_uid = cifs_sb->ctx->linux_uid; - fattr->cf_gid = cifs_sb->ctx->linux_gid; } static int diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 7ef5a4b37901db..80508a1af1a4d9 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -125,6 +125,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; fattr->cf_rdev = inode->i_rdev; + fattr->cf_uid = inode->i_uid; + fattr->cf_gid = inode->i_gid; fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; fattr->cf_symlink_target = NULL; } else { diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 5f6674d8e098bd..6ed0ea27327050 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -254,7 +254,9 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, { struct cifs_open_info_data data; struct reparse_data_buffer buf; + struct smb2_create_ea_ctx *cc; struct inode *new; + unsigned int len; struct kvec reparse_iov, xattr_iov; int rc; @@ -271,6 +273,11 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, }, }; + cc = xattr_iov.iov_base; + len = le32_to_cpu(cc->ctx.DataLength); + memcpy(data.wsl.eas, &cc->ea, len); + data.wsl.eas_len = len; + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, tcon, full_path, &reparse_iov, &xattr_iov); @@ -404,6 +411,62 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, return parse_reparse_point(buf, plen, cifs_sb, true, data); } +static void wsl_to_fattr(struct cifs_open_info_data *data, + struct cifs_sb_info *cifs_sb, + u32 tag, struct cifs_fattr *fattr) +{ + struct smb2_file_full_ea_info *ea; + u32 next = 0; + + switch (tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + fattr->cf_mode |= S_IFLNK; + break; + case IO_REPARSE_TAG_LX_FIFO: + fattr->cf_mode |= S_IFIFO; + break; + case IO_REPARSE_TAG_AF_UNIX: + fattr->cf_mode |= S_IFSOCK; + break; + case IO_REPARSE_TAG_LX_CHR: + fattr->cf_mode |= S_IFCHR; + break; + case IO_REPARSE_TAG_LX_BLK: + fattr->cf_mode |= S_IFBLK; + break; + } + + if (!data->wsl.eas_len) + goto out; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + do { + const char *name; + void *v; + u8 nlen; + + ea = (void *)((u8 *)ea + next); + next = le32_to_cpu(ea->next_entry_offset); + if (!le16_to_cpu(ea->ea_value_length)) + continue; + + name = ea->ea_data; + nlen = ea->ea_name_length; + v = (void *)((u8 *)ea->ea_data + ea->ea_name_length + 1); + + if (!strncmp(name, SMB2_WSL_XATTR_UID, nlen)) + fattr->cf_uid = wsl_make_kuid(cifs_sb, v); + else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen)) + fattr->cf_gid = wsl_make_kgid(cifs_sb, v); + else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) + fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v); + else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) + fattr->cf_rdev = wsl_mkdev(v); + } while (next); +out: + fattr->cf_dtype = S_DT(fattr->cf_mode); +} + bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data) @@ -444,24 +507,11 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, switch (tag) { case IO_REPARSE_TAG_LX_SYMLINK: - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - break; case IO_REPARSE_TAG_LX_FIFO: - fattr->cf_mode |= S_IFIFO; - fattr->cf_dtype = DT_FIFO; - break; case IO_REPARSE_TAG_AF_UNIX: - fattr->cf_mode |= S_IFSOCK; - fattr->cf_dtype = DT_SOCK; - break; case IO_REPARSE_TAG_LX_CHR: - fattr->cf_mode |= S_IFCHR; - fattr->cf_dtype = DT_CHR; - break; case IO_REPARSE_TAG_LX_BLK: - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; + wsl_to_fattr(data, cifs_sb, tag, fattr); break; case 0: /* SMB1 symlink */ case IO_REPARSE_TAG_SYMLINK: diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 9816bac9855257..6b55d1df9e2f84 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -8,6 +8,8 @@ #include #include +#include +#include "fs_context.h" #include "cifsglob.h" static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf) @@ -17,6 +19,33 @@ static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf) return MKDEV(v >> 32, v & 0xffffffff); } +static inline dev_t wsl_mkdev(void *ptr) +{ + u64 v = le64_to_cpu(*(__le64 *)ptr); + + return MKDEV(v & 0xffffffff, v >> 32); +} + +static inline kuid_t wsl_make_kuid(struct cifs_sb_info *cifs_sb, + void *ptr) +{ + u32 uid = le32_to_cpu(*(__le32 *)ptr); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + return cifs_sb->ctx->linux_uid; + return make_kuid(current_user_ns(), uid); +} + +static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb, + void *ptr) +{ + u32 gid = le32_to_cpu(*(__le32 *)ptr); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + return cifs_sb->ctx->linux_gid; + return make_kgid(current_user_ns(), gid); +} + static inline u64 reparse_mode_nfs_type(mode_t mode) { switch (mode & S_IFMT) { From 1b5af823d703ee183ffdde188aaf584ab93eea19 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 11:26:49 +0100 Subject: [PATCH 657/707] soc/tegra: fix build failure on Tegra241 If all the other SoCs are disabled, the driver fails to build: drivers/soc/tegra/fuse/fuse-tegra30.c:684:17: error: 'tegra30_fuse_read' undeclared here (not in a function); did you mean 'tegra_fuse_readl'? 684 | .read = tegra30_fuse_read, | ^~~~~~~~~~~~~~~~~ | tegra_fuse_readl drivers/soc/tegra/fuse/fuse-tegra30.c:694:17: error: 'tegra30_fuse_init' undeclared here (not in a function); did you mean 'tegra_fuse_info'? 694 | .init = tegra30_fuse_init, | ^~~~~~~~~~~~~~~~~ Fix the list of SoCs using this function to include the newly added one. Fixes: dee509eb9cd5 ("soc/tegra: fuse: Add support for Tegra241") Reviewed-by: Jon Hunter Reviewed-by: Kartik Signed-off-by: Arnd Bergmann --- drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c index e94d46372a6396..92ac5693382637 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra30.c +++ b/drivers/soc/tegra/fuse/fuse-tegra30.c @@ -38,7 +38,8 @@ defined(CONFIG_ARCH_TEGRA_210_SOC) || \ defined(CONFIG_ARCH_TEGRA_186_SOC) || \ defined(CONFIG_ARCH_TEGRA_194_SOC) || \ - defined(CONFIG_ARCH_TEGRA_234_SOC) + defined(CONFIG_ARCH_TEGRA_234_SOC) || \ + defined(CONFIG_ARCH_TEGRA_241_SOC) static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset) { if (WARN_ON(!fuse->base)) From 4eacc39d5529cb0bb6bd9a6608658703d7af1e05 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 31 Jan 2024 21:57:27 +0100 Subject: [PATCH 658/707] dm-crypt, dm-verity: disable tasklets Tasklets have an inherent problem with memory corruption. The function tasklet_action_common calls tasklet_trylock, then it calls the tasklet callback and then it calls tasklet_unlock. If the tasklet callback frees the structure that contains the tasklet or if it calls some code that may free it, tasklet_unlock will write into free memory. The commits 8e14f610159d and d9a02e016aaf try to fix it for dm-crypt, but it is not a sufficient fix and the data corruption can still happen [1]. There is no fix for dm-verity and dm-verity will write into free memory with every tasklet-processed bio. There will be atomic workqueues implemented in the kernel 6.9 [2]. They will have better interface and they will not suffer from the memory corruption problem. But we need something that stops the memory corruption now and that can be backported to the stable kernels. So, I'm proposing this commit that disables tasklets in both dm-crypt and dm-verity. This commit doesn't remove the tasklet support, because the tasklet code will be reused when atomic workqueues will be implemented. [1] https://lore.kernel.org/all/d390d7ee-f142-44d3-822a-87949e14608b@suse.de/T/ [2] https://lore.kernel.org/lkml/20240130091300.2968534-1-tj@kernel.org/ Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: 39d42fa96ba1b ("dm crypt: add flags to optionally bypass kcryptd workqueues") Fixes: 5721d4e5a9cdb ("dm verity: Add optional "try_verify_in_tasklet" feature") Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 38 ++--------------------------------- drivers/md/dm-verity-target.c | 27 ++----------------------- 2 files changed, 4 insertions(+), 61 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 855b482cbff1f0..f745f85082434d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -73,10 +73,8 @@ struct dm_crypt_io { struct bio *base_bio; u8 *integrity_metadata; bool integrity_metadata_from_pool:1; - bool in_tasklet:1; struct work_struct work; - struct tasklet_struct tasklet; struct convert_context ctx; @@ -1762,7 +1760,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->ctx.r.req = NULL; io->integrity_metadata = NULL; io->integrity_metadata_from_pool = false; - io->in_tasklet = false; atomic_set(&io->io_pending, 0); } @@ -1771,13 +1768,6 @@ static void crypt_inc_pending(struct dm_crypt_io *io) atomic_inc(&io->io_pending); } -static void kcryptd_io_bio_endio(struct work_struct *work) -{ - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - - bio_endio(io->base_bio); -} - /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. @@ -1801,20 +1791,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io) base_bio->bi_status = error; - /* - * If we are running this function from our tasklet, - * we can't call bio_endio() here, because it will call - * clone_endio() from dm.c, which in turn will - * free the current struct dm_crypt_io structure with - * our tasklet. In this case we need to delay bio_endio() - * execution to after the tasklet is done and dequeued. - */ - if (io->in_tasklet) { - INIT_WORK(&io->work, kcryptd_io_bio_endio); - queue_work(cc->io_queue, &io->work); - return; - } - bio_endio(base_bio); } @@ -2246,11 +2222,6 @@ static void kcryptd_crypt(struct work_struct *work) kcryptd_crypt_write_convert(io); } -static void kcryptd_crypt_tasklet(unsigned long work) -{ - kcryptd_crypt((struct work_struct *)work); -} - static void kcryptd_queue_crypt(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; @@ -2262,15 +2233,10 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) * irqs_disabled(): the kernel may run some IO completion from the idle thread, but * it is being executed with irqs disabled. */ - if (in_hardirq() || irqs_disabled()) { - io->in_tasklet = true; - tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); - tasklet_schedule(&io->tasklet); + if (!(in_hardirq() || irqs_disabled())) { + kcryptd_crypt(&io->work); return; } - - kcryptd_crypt(&io->work); - return; } INIT_WORK(&io->work, kcryptd_crypt); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 14e58ae705218f..a8c7dab1719524 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -21,7 +21,6 @@ #include #include #include -#include #define DM_MSG_PREFIX "verity" @@ -645,23 +644,6 @@ static void verity_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } -static void verity_tasklet(unsigned long data) -{ - struct dm_verity_io *io = (struct dm_verity_io *)data; - int err; - - io->in_tasklet = true; - err = verity_verify_io(io); - if (err == -EAGAIN || err == -ENOMEM) { - /* fallback to retrying with work-queue */ - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); - return; - } - - verity_finish_io(io, errno_to_blk_status(err)); -} - static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; @@ -674,13 +656,8 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) { - tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io); - tasklet_schedule(&io->tasklet); - } else { - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); - } + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); } /* From c759bb3394ab095d87beca6f75e172ec87571a5c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 1 Feb 2024 13:03:05 +1100 Subject: [PATCH 659/707] Revert "kasan: revert eviction of stack traces in generic mode" This reverts commit 2ef8127ce56d76a4f861e7f0c40e241409c18087. --- mm/kasan/common.c | 8 +++-- mm/kasan/generic.c | 68 ++++++++++++++++++++++++++++++++++++++----- mm/kasan/kasan.h | 10 +++++++ mm/kasan/quarantine.c | 5 +--- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6ca63e8dda741b..610efae9122094 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -65,7 +65,8 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags) { depot_stack_handle_t stack; - stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); + stack = kasan_save_stack(flags, + STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); kasan_set_track(track, stack); } @@ -265,9 +266,10 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, return true; /* - * Note: Keep per-object metadata to allow KASAN print stack traces for - * use-after-free-before-realloc bugs. + * If the object is not put into quarantine, it will likely be quickly + * reallocated. Thus, release its metadata now. */ + kasan_release_object_meta(cache, object); /* Let slab put the object onto the freelist. */ return false; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index fc9cf1860efb34..df6627f62402c0 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -485,6 +485,16 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) if (alloc_meta) { /* Zero out alloc meta to mark it as invalid. */ __memset(alloc_meta, 0, sizeof(*alloc_meta)); + + /* + * Prepare the lock for saving auxiliary stack traces. + * Temporarily disable KASAN bug reporting to allow instrumented + * raw_spin_lock_init to access aux_lock, which resides inside + * of a redzone. + */ + kasan_disable_current(); + raw_spin_lock_init(&alloc_meta->aux_lock); + kasan_enable_current(); } /* @@ -496,8 +506,18 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) static void release_alloc_meta(struct kasan_alloc_meta *meta) { - /* Zero out alloc meta to mark it as invalid. */ - __memset(meta, 0, sizeof(*meta)); + /* Evict the stack traces from stack depot. */ + stack_depot_put(meta->alloc_track.stack); + stack_depot_put(meta->aux_stack[0]); + stack_depot_put(meta->aux_stack[1]); + + /* + * Zero out alloc meta to mark it as invalid but keep aux_lock + * initialized to avoid having to reinitialize it when another object + * is allocated in the same slot. + */ + __memset(&meta->alloc_track, 0, sizeof(meta->alloc_track)); + __memset(meta->aux_stack, 0, sizeof(meta->aux_stack)); } static void release_free_meta(const void *object, struct kasan_free_meta *meta) @@ -506,10 +526,27 @@ static void release_free_meta(const void *object, struct kasan_free_meta *meta) if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; + /* Evict the stack trace from the stack depot. */ + stack_depot_put(meta->free_track.stack); + /* Mark free meta as invalid. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; } +void kasan_release_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + release_alloc_meta(alloc_meta); + + free_meta = kasan_get_free_meta(cache, object); + if (free_meta) + release_free_meta(object, free_meta); +} + size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { struct kasan_cache *info = &cache->kasan_info; @@ -534,6 +571,8 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) struct kmem_cache *cache; struct kasan_alloc_meta *alloc_meta; void *object; + depot_stack_handle_t new_handle, old_handle; + unsigned long flags; if (is_kfence_address(addr) || !slab) return; @@ -544,18 +583,33 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) if (!alloc_meta) return; + new_handle = kasan_save_stack(0, depot_flags); + + /* + * Temporarily disable KASAN bug reporting to allow instrumented + * spinlock functions to access aux_lock, which resides inside of a + * redzone. + */ + kasan_disable_current(); + raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags); + old_handle = alloc_meta->aux_stack[1]; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); + alloc_meta->aux_stack[0] = new_handle; + raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags); + kasan_enable_current(); + + stack_depot_put(old_handle); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); + return __kasan_record_aux_stack(addr, + STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, 0); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -566,7 +620,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) if (!alloc_meta) return; - /* Invalidate previous stack traces (might exist for krealloc or mempool). */ + /* Evict previous stack traces (might exist for krealloc or mempool). */ release_alloc_meta(alloc_meta); kasan_save_track(&alloc_meta->alloc_track, flags); @@ -580,7 +634,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) if (!free_meta) return; - /* Invalidate previous stack trace (might exist for mempool). */ + /* Evict previous stack trace (might exist for mempool). */ release_free_meta(object, free_meta); kasan_save_track(&free_meta->free_track, 0); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb2b9ac0659a7a..d0f172f2b9783f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -264,6 +265,13 @@ struct kasan_global { struct kasan_alloc_meta { struct kasan_track alloc_track; /* Free track is stored in kasan_free_meta. */ + /* + * aux_lock protects aux_stack from accesses from concurrent + * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping + * on RT kernels, as kasan_record_aux_stack_noalloc can be called from + * non-sleepable contexts. + */ + raw_spinlock_t aux_lock; depot_stack_handle_t aux_stack[2]; }; @@ -390,8 +398,10 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +void kasan_release_object_meta(struct kmem_cache *cache, const void *object); #else static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } +static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 6958aa713c67ee..3ba02efb952aac 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -145,10 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) void *object = qlink_to_object(qlink, cache); struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object); - /* - * Note: Keep per-object metadata to allow KASAN print stack traces for - * use-after-free-before-realloc bugs. - */ + kasan_release_object_meta(cache, object); /* * If init_on_free is enabled and KASAN's free metadata is stored in From d0c0d80c116244ac37890506bc2fbe1ef7e6212f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 1 Feb 2024 13:03:16 +1100 Subject: [PATCH 660/707] Revert "stackdepot: use variable size records for non-evictable entries" This reverts commit d869d3fb362c51a59c173fdee050dc100ff68383. --- include/linux/poison.h | 3 - lib/stackdepot.c | 250 ++++++++++++++++++++--------------------- 2 files changed, 123 insertions(+), 130 deletions(-) diff --git a/include/linux/poison.h b/include/linux/poison.h index 1f0ee2459f2aa2..27a7dad17eefb8 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -92,7 +92,4 @@ /********** VFS **********/ #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) -/********** lib/stackdepot.c **********/ -#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) - #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8f3b2c84ec2db3..5caa1f56655384 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -44,7 +43,17 @@ #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) +#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32 +/* + * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack + * traces. As KMSAN does not support evicting stack traces from the stack + * depot, the stack depot capacity might be reached quickly with large stack + * records. Adjust the maximum number of stack depot pools for this case. + */ +#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16)) +#else #define DEPOT_POOLS_CAP 8192 +#endif #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) @@ -84,6 +93,9 @@ struct stack_record { }; }; +#define DEPOT_STACK_RECORD_SIZE \ + ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN) + static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -109,31 +121,32 @@ static void *stack_pools[DEPOT_MAX_POOLS]; static void *new_pool; /* Number of pools in stack_pools. */ static int pools_num; -/* Offset to the unused space in the currently used pool. */ -static size_t pool_offset = DEPOT_POOL_SIZE; /* Freelist of stack records within stack_pools. */ static LIST_HEAD(free_stacks); +/* + * Stack depot tries to keep an extra pool allocated even before it runs out + * of space in the currently used pool. This flag marks whether this extra pool + * needs to be allocated. It has the value 0 when either an extra pool is not + * yet allocated or if the limit on the number of pools is reached. + */ +static bool new_pool_required = true; /* The lock must be held when performing pool or freelist modifications. */ static DEFINE_RAW_SPINLOCK(pool_lock); /* Statistics counters for debugfs. */ enum depot_counter_id { - DEPOT_COUNTER_REFD_ALLOCS, - DEPOT_COUNTER_REFD_FREES, - DEPOT_COUNTER_REFD_INUSE, + DEPOT_COUNTER_ALLOCS, + DEPOT_COUNTER_FREES, + DEPOT_COUNTER_INUSE, DEPOT_COUNTER_FREELIST_SIZE, - DEPOT_COUNTER_PERSIST_COUNT, - DEPOT_COUNTER_PERSIST_BYTES, DEPOT_COUNTER_COUNT, }; static long counters[DEPOT_COUNTER_COUNT]; static const char *const counter_names[] = { - [DEPOT_COUNTER_REFD_ALLOCS] = "refcounted_allocations", - [DEPOT_COUNTER_REFD_FREES] = "refcounted_frees", - [DEPOT_COUNTER_REFD_INUSE] = "refcounted_in_use", + [DEPOT_COUNTER_ALLOCS] = "allocations", + [DEPOT_COUNTER_FREES] = "frees", + [DEPOT_COUNTER_INUSE] = "in_use", [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size", - [DEPOT_COUNTER_PERSIST_COUNT] = "persistent_count", - [DEPOT_COUNTER_PERSIST_BYTES] = "persistent_bytes", }; static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT); @@ -281,52 +294,48 @@ int stack_depot_init(void) EXPORT_SYMBOL_GPL(stack_depot_init); /* - * Initializes new stack pool, and updates the list of pools. + * Initializes new stack depot @pool, release all its entries to the freelist, + * and update the list of pools. */ -static bool depot_init_pool(void **prealloc) +static void depot_init_pool(void *pool) { + int offset; + lockdep_assert_held(&pool_lock); - if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { - /* Bail out if we reached the pool limit. */ - WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */ - WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */ - WARN_ONCE(1, "Stack depot reached limit capacity"); - return false; - } + /* Initialize handles and link stack records into the freelist. */ + for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE; + offset += DEPOT_STACK_RECORD_SIZE) { + struct stack_record *stack = pool + offset; - if (!new_pool && *prealloc) { - /* We have preallocated memory, use it. */ - WRITE_ONCE(new_pool, *prealloc); - *prealloc = NULL; - } + stack->handle.pool_index = pools_num; + stack->handle.offset = offset >> DEPOT_STACK_ALIGN; + stack->handle.extra = 0; - if (!new_pool) - return false; /* new_pool and *prealloc are NULL */ + /* + * Stack traces of size 0 are never saved, and we can simply use + * the size field as an indicator if this is a new unused stack + * record in the freelist. + */ + stack->size = 0; - /* Save reference to the pool to be used by depot_fetch_stack(). */ - stack_pools[pools_num] = new_pool; + INIT_LIST_HEAD(&stack->hash_list); + /* + * Add to the freelist front to prioritize never-used entries: + * required in case there are entries in the freelist, but their + * RCU cookie still belongs to the current RCU grace period + * (there can still be concurrent readers). + */ + list_add(&stack->free_list, &free_stacks); + counters[DEPOT_COUNTER_FREELIST_SIZE]++; + } - /* - * Stack depot tries to keep an extra pool allocated even before it runs - * out of space in the currently used pool. - * - * To indicate that a new preallocation is needed new_pool is reset to - * NULL; do not reset to NULL if we have reached the maximum number of - * pools. - */ - if (pools_num < DEPOT_MAX_POOLS) - WRITE_ONCE(new_pool, NULL); - else - WRITE_ONCE(new_pool, STACK_DEPOT_POISON); + /* Save reference to the pool to be used by depot_fetch_stack(). */ + stack_pools[pools_num] = pool; /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */ WRITE_ONCE(pools_num, pools_num + 1); ASSERT_EXCLUSIVE_WRITER(pools_num); - - pool_offset = 0; - - return true; } /* Keeps the preallocated memory to be used for a new stack depot pool. */ @@ -338,51 +347,63 @@ static void depot_keep_new_pool(void **prealloc) * If a new pool is already saved or the maximum number of * pools is reached, do not use the preallocated memory. */ - if (new_pool) + if (!new_pool_required) return; - WRITE_ONCE(new_pool, *prealloc); - *prealloc = NULL; + /* + * Use the preallocated memory for the new pool + * as long as we do not exceed the maximum number of pools. + */ + if (pools_num < DEPOT_MAX_POOLS) { + new_pool = *prealloc; + *prealloc = NULL; + } + + /* + * At this point, either a new pool is kept or the maximum + * number of pools is reached. In either case, take note that + * keeping another pool is not required. + */ + WRITE_ONCE(new_pool_required, false); } /* - * Try to initialize a new stack record from the current pool, a cached pool, or - * the current pre-allocation. + * Try to initialize a new stack depot pool from either a previous or the + * current pre-allocation, and release all its entries to the freelist. */ -static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) +static bool depot_try_init_pool(void **prealloc) { - struct stack_record *stack; - void *current_pool; - u32 pool_index; - lockdep_assert_held(&pool_lock); - if (pool_offset + size > DEPOT_POOL_SIZE) { - if (!depot_init_pool(prealloc)) - return NULL; - } + /* Check if we have a new pool saved and use it. */ + if (new_pool) { + depot_init_pool(new_pool); + new_pool = NULL; - if (WARN_ON_ONCE(pools_num < 1)) - return NULL; - pool_index = pools_num - 1; - current_pool = stack_pools[pool_index]; - if (WARN_ON_ONCE(!current_pool)) - return NULL; + /* Take note that we might need a new new_pool. */ + if (pools_num < DEPOT_MAX_POOLS) + WRITE_ONCE(new_pool_required, true); - stack = current_pool + pool_offset; + return true; + } - /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index; - stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; - stack->handle.extra = 0; - INIT_LIST_HEAD(&stack->hash_list); + /* Bail out if we reached the pool limit. */ + if (unlikely(pools_num >= DEPOT_MAX_POOLS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return false; + } - pool_offset += size; + /* Check if we have preallocated memory and use it. */ + if (*prealloc) { + depot_init_pool(*prealloc); + *prealloc = NULL; + return true; + } - return stack; + return false; } -/* Try to find next free usable entry from the freelist. */ +/* Try to find next free usable entry. */ static struct stack_record *depot_pop_free(void) { struct stack_record *stack; @@ -399,7 +420,7 @@ static struct stack_record *depot_pop_free(void) * check the first entry. */ stack = list_first_entry(&free_stacks, struct stack_record, free_list); - if (!poll_state_synchronize_rcu(stack->rcu_state)) + if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state)) return NULL; list_del(&stack->free_list); @@ -408,73 +429,48 @@ static struct stack_record *depot_pop_free(void) return stack; } -static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries) -{ - const size_t used = flex_array_size(s, entries, nr_entries); - const size_t unused = sizeof(s->entries) - used; - - WARN_ON_ONCE(sizeof(s->entries) < used); - - return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN); -} - /* Allocates a new stack in a stack depot pool. */ static struct stack_record * -depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc) +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { - struct stack_record *stack = NULL; - size_t record_size; + struct stack_record *stack; lockdep_assert_held(&pool_lock); /* This should already be checked by public API entry points. */ - if (WARN_ON_ONCE(!nr_entries)) + if (WARN_ON_ONCE(!size)) return NULL; - /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ - if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES) - nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES; - - if (flags & STACK_DEPOT_FLAG_GET) { - /* - * Evictable entries have to allocate the max. size so they may - * safely be re-used by differently sized allocations. - */ - record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES); - stack = depot_pop_free(); - } else { - record_size = depot_stack_record_size(stack, nr_entries); - } - + /* Check if we have a stack record to save the stack trace. */ + stack = depot_pop_free(); if (!stack) { - stack = depot_pop_free_pool(prealloc, record_size); - if (!stack) + /* No usable entries on the freelist - try to refill the freelist. */ + if (!depot_try_init_pool(prealloc)) + return NULL; + stack = depot_pop_free(); + if (WARN_ON(!stack)) return NULL; } + /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */ + if (size > CONFIG_STACKDEPOT_MAX_FRAMES) + size = CONFIG_STACKDEPOT_MAX_FRAMES; + /* Save the stack trace. */ stack->hash = hash; - stack->size = nr_entries; - /* stack->handle is already filled in by depot_pop_free_pool(). */ - memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries)); - - if (flags & STACK_DEPOT_FLAG_GET) { - refcount_set(&stack->count, 1); - counters[DEPOT_COUNTER_REFD_ALLOCS]++; - counters[DEPOT_COUNTER_REFD_INUSE]++; - } else { - /* Warn on attempts to switch to refcounting this entry. */ - refcount_set(&stack->count, REFCOUNT_SATURATED); - counters[DEPOT_COUNTER_PERSIST_COUNT]++; - counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size; - } + stack->size = size; + /* stack->handle is already filled in by depot_init_pool(). */ + refcount_set(&stack->count, 1); + memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); /* * Let KMSAN know the stored stack record is initialized. This shall * prevent false positive reports if instrumented code accesses it. */ - kmsan_unpoison_memory(stack, record_size); + kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE); + counters[DEPOT_COUNTER_ALLOCS]++; + counters[DEPOT_COUNTER_INUSE]++; return stack; } @@ -542,8 +538,8 @@ static void depot_free_stack(struct stack_record *stack) list_add_tail(&stack->free_list, &free_stacks); counters[DEPOT_COUNTER_FREELIST_SIZE]++; - counters[DEPOT_COUNTER_REFD_FREES]++; - counters[DEPOT_COUNTER_REFD_INUSE]--; + counters[DEPOT_COUNTER_FREES]++; + counters[DEPOT_COUNTER_INUSE]--; printk_deferred_exit(); raw_spin_unlock_irqrestore(&pool_lock, flags); @@ -664,7 +660,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * Allocate memory for a new pool if required now: * we won't be able to do that under the lock. */ - if (unlikely(can_alloc && !READ_ONCE(new_pool))) { + if (unlikely(can_alloc && READ_ONCE(new_pool_required))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -685,7 +681,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, found = find_stack(bucket, entries, nr_entries, hash, depot_flags); if (!found) { struct stack_record *new = - depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc); + depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { /* From 51b70ff55ed88edd19b080a524063446bcc34b62 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 1 Feb 2024 14:29:51 +1100 Subject: [PATCH 661/707] Add linux-next specific files for 20240201 Signed-off-by: Stephen Rothwell --- Next/SHA1s | 370 ++++ Next/Trees | 372 ++++ Next/merge.log | 5227 +++++++++++++++++++++++++++++++++++++++++++++ localversion-next | 1 + 4 files changed, 5970 insertions(+) create mode 100644 Next/SHA1s create mode 100644 Next/Trees create mode 100644 Next/merge.log create mode 100644 localversion-next diff --git a/Next/SHA1s b/Next/SHA1s new file mode 100644 index 00000000000000..ad17af9cebfd34 --- /dev/null +++ b/Next/SHA1s @@ -0,0 +1,370 @@ +Name SHA1 +---- ---- +origin 6764c317b6bb91bd806ef79adf6d9c0e428b191e +fixes 2dde18cd1d8fac735875f2e4987f11817cc0bc2c +mm-hotfixes bbac2bacc15831e9f92dbf7deabe8192c2d8ea92 +kbuild-current bfef491df67022c56aab3b831044f8d259f9441f +arc-current 861deac3b092f37b2c5e6871732f3e11486f7082 +arm-current f54e8634d1366926c807e2af6125b33cff555fa7 +arm64-fixes c7767f5c43df2c453af4651d1f58f489e3eb4ac1 +arm-soc-fixes 1b5af823d703ee183ffdde188aaf584ab93eea19 +davinci-current 6613476e225e090cc9aad49be7fa504e290dd33d +drivers-memory-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +sophgo-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +tee-fixes ceaa837f96adb69c0df0397937cd74991d5d821a +m68k-current 6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8 +powerpc-fixes 18f14afe281648e31ed35c9ad2fcb724c4838ad9 +s390-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +fscrypt-current 4bcf6f827a79c59806c695dc280e763c5b6a6813 +fsverity-current a075bacde257f755bea0e53400c9f1cdd1b8e8e6 +net c9ec85153fea6873c52ed4f5055c87263f1b54f9 +bpf 577e4432f3ac810049cb7e6b71f4d96ec7c6e894 +ipsec 983a73da1f996faee9997149eb05b12fa7bd8cbf +netfilter a2933a8759a62269754e54733d993b19de870e84 +ipvs a2933a8759a62269754e54733d993b19de870e84 +wireless f3f8f050316893fe2da523458ff7f5f6d61fb1a6 +wpan b85ea95d086471afb4ad062012a4d73cd328fa86 +rdma-fixes 80dde187f734cf9ccf988d5c2ef1a46b990660fd +sound-current 2468e8922d2f6da81a6192b73023eff67e3fefdd +sound-asoc-fixes eeab239d6a2418fc5d2cd7ea76187085a97acde0 +regmap-fixes 8b921545ddc68c960f86699af906b6c6f361f16c +regulator-fixes a3fa9838e8140584a6f338e8516f2b05d3bea812 +spi-fixes 6500ad28fd5d67d5ca0fee9da73c463090842440 +pci-current 925bd5e08106bd5bfbd1cb8e124c89a0b4003569 +driver-core.current 98323e9d70172f1b46d1cadb20d6c54abf62870d +tty.current b35f8dbbce818b02c730dc85133dc7754266e084 +usb.current f2e5d3de7e1fbf24483e7f996e519b3ebc3935a1 +usb-serial-fixes b4a1f4eaf1d798066affc6ad040f76eb1a16e1c9 +phy 7104ba0f1958adb250319e68a15eff89ec4fd36d +staging.current 6613476e225e090cc9aad49be7fa504e290dd33d +iio-fixes 6f6c72acddf4357fcc83593c20ef9064fb42db92 +counter-current 6613476e225e090cc9aad49be7fa504e290dd33d +char-misc.current ac9762a74c7ca7cbfcb4c65f5871373653a046ac +soundwire-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +thunderbolt-fixes ec4d82f855ce332de26fe080892483de98cc1a19 +input-current 2b9c3eb32a699acdd4784d6b93743271b4970899 +crypto-current c5a2f74db71a849f3a60bc153d684d6d28a0c665 +vfio-fixes 4ea95c04fa6b9043a1a301240996aeebe3cb28ec +kselftest-fixes b54761f6e9773350c0d1fb8e1e5aacaba7769d0f +modules-fixes f412eef03938d3a40d4f6f5a79d0f98ed89b596d +dmaengine-fixes a22fe1d6dec7e98535b97249fdc95c2be79120bb +backlight-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +mtd-fixes 7c1b1906229db88c487e21e1ecb622db64a1830d +mfd-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +v4l-dvb-fixes b32431b753217d8d45b018443b1a7aac215921fb +reset-fixes 4a6756f56bcf8e64c87144a626ce53aea4899c0e +mips-fixes 59be5c35850171e307ca5d3d703ee9ff4096b948 +at91-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +omap-fixes 9b6a51aab5f5f9f71d2fa16e8b4d530e1643dfcb +kvm-fixes 0dd3ee31125508cd67f7e7172247f05b7fd1753a +kvms390-fixes 83303a4c776ce1032d88df59e811183479acea77 +hwmon-fixes 915644189c22d9c93e9fee7c7c993b58e745bef7 +nvdimm-fixes 33908660e814203e996f6e775d033c5c32fcf9a7 +cxl-fixes 6be99530c92c6b8ff7a01903edc42393575ad63b +btrfs-fixes c94bd41cb0b62737d2d05c9e2e9f7aa678046b86 +vfs-fixes 485053bb81c81a122edd982b263277e65d7485c5 +dma-mapping-fixes d5090484b021794271280ab64d20253883b7f6fd +drivers-x86-fixes 1abdf288b0ef5606f76b6e191fa6df05330e3d7e +samsung-krzk-fixes eab4f56d3e75dad697acf8dc2c8be3c341d6c63e +pinctrl-samsung-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +devicetree-fixes 8f7e917907385e112a845d668ae2832f41e64bf5 +dt-krzk-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +scsi-fixes f4469f3858352ad1197434557150b1f7086762a0 +drm-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +drm-intel-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +mmc-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +rtc-fixes 08279468a294d8c996a657ecc9e51bd5c084c75d +gnss-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +hyperv-fixes 564eac2860bdbe6ac651e6909ac07ecd93d778f3 +soc-fsl-fixes 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5 +risc-v-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +riscv-dt-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +riscv-soc-fixes a9d022ae8c4faca73467f70ca4a880787d855f94 +fpga-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +spdx 6613476e225e090cc9aad49be7fa504e290dd33d +gpio-brgl-fixes 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +gpio-intel-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl-intel-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +erofs-fixes d9281660ff3ffb4a05302b485cc59a87e709aefc +kunit-fixes 1a9f2c776d1416c4ea6cb0d0b9917778c41a1a7d +ubifs-fixes 2241ab53cbb5cdb08a6b2d4688feb13971058f65 +memblock-fixes 6a9531c3a88096a26cf3ac582f7ec44f94a7dcb2 +nfsd-fixes ccbca118ef1a71d5faa012b9bb1ecd784e9e2b42 +renesas-fixes 9eab43facdadb7d00456c2657001ae2e5353c814 +perf-current fdd0ae72b34e56eb5e896d067c49a78ecb451032 +efi-fixes aa0e784dea7c1a026aabff9db1cb5d2bd92b3e92 +zstd-fixes 77618db346455129424fadbbaec596a09feaf3bb +battery-fixes d0266d7ab1618482d58015d67a5220e590333298 +uml-fixes 73a23d7710331a530e972903318528b75e5a5f58 +iommufd-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +rust-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +v9fs-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +w1-fixes 6613476e225e090cc9aad49be7fa504e290dd33d +pmdomain-fixes c41336f4d69057cbf88fed47951379b384540df5 +overlayfs-fixes 420332b94119cdc7db4477cc88484691cb92ae71 +i2c-host-fixes 9189526c46f2ad14df25dbdc30443f79d03dc7bd +drm-misc-fixes 1c1914d6e8c6edbf5b45047419ff51abdb1dce96 +mm-stable 6613476e225e090cc9aad49be7fa504e290dd33d +mm-nonmm-stable 6613476e225e090cc9aad49be7fa504e290dd33d +mm ec2dacea82ce333584fa31384eb0b55eb2dbc604 +kbuild bd768db42ef6d27c3eca1d29ec8beb4474e4ba35 +clang-format 5a205c6a9f79d14db38006aa2d7c4f4e76b1bfc7 +perf 7727d59de44e4568d0ad0f3867c8bdec69d688fe +compiler-attributes 2993eb7a8d34aee6165e1f6676e81cdf1d22aa62 +dma-mapping 7c65aa3cc072cee76f577262fbe381a111a98774 +asm-generic 34b2321cc648a246d08cc51e423532eac690ccf1 +arc 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +arm 8790fade1a19caf714ba1d91ce1fdceb9f2067f2 +arm64 1b20d0486a602417defb5bf33320d31b2a7a47f8 +arm-perf bb339db4d363c84e0a8d70827df591397ccd7312 +arm-soc 0d1d824a4ac102db35bc8524a8be97ada8ad37ab +amlogic 0dd3ee31125508cd67f7e7172247f05b7fd1753a +asahi-soc ffc253263a1375a65fa6c9f62a893e9767fbebfa +aspeed e60f7a99d3789b5d0b24d3c0571b013309e56815 +at91 6613476e225e090cc9aad49be7fa504e290dd33d +broadcom bbf6d7dc2d94fe03a57b173a13f8fbf9123bb2a8 +davinci 6613476e225e090cc9aad49be7fa504e290dd33d +drivers-memory 2f542c937c48c2bd5a8ddf180b417fbe7152559f +imx-mxs 4db02d61a81ea4ab3bf9ec6878ccf3f2a3f27405 +mediatek 9802b60bd6d895a8be9c44cc5f74bb11131aa8bc +mvebu 476887312c6082e2c03efc3f016e8134c076108e +omap 0012c1958460386adc5770baf2f53206aed77ff3 +qcom f70a1e7dd74f0bf5f58137a4379453873f64797a +renesas 6fc5bb9da080f9f12f2dc13647a695846cb2f8f5 +reset c3c46acd5be9a3351c163d2869045cab4d5342dc +rockchip a3c3232263626d7e6629cc4f134532a2b792c05c +samsung-krzk 819ce8ab3d99371ad17662c22a8843d450ca237a +scmi 99f798bdfb75a8e07f90462399930186c8392997 +sophgo 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +stm32 bda732fda19365b7a7397d0d37090f6dc253232c +sunxi 38ed19495066966979ba821b9e0f549ad5ea620d +tee 84ec4fd8883176442d21454bcecd3c29c0aabad6 +tegra 5e6333ef8ea5ab004f8a24a8ebb0a3bc15b05586 +ti 6613476e225e090cc9aad49be7fa504e290dd33d +xilinx 0ee74e0d7b9786976cc565ac08887469605346e7 +clk efe5a1b888ab0f6acf723e2a12a4644a599294d0 +clk-imx f52f00069888e410cec718792b3e314624f209ea +clk-renesas 096311157d2a6bb8f06e28e1143e2a5de6a0183b +csky 2c40c1c6adab90ee4660caf03722b3a3ec67767b +loongarch 48ef9e87b407f89f230f804815af7ac2031ec17a +m68k 6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8 +m68knommu 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +microblaze 6613476e225e090cc9aad49be7fa504e290dd33d +mips 6613476e225e090cc9aad49be7fa504e290dd33d +openrisc c289330331eb93bc6a3c68b9119ccd7d4285a4a2 +parisc-hd 913b9d443a0180cf0de3548f1ab3149378998486 +powerpc 44a1aad2fe6c10bfe0589d8047057b10a4c18a19 +soc-fsl fb9c384625dd604e8a5be1f42b35e83104b90670 +risc-v cb4ede926134a65bc3bf90ed58dace8451d7e759 +riscv-dt 2db68ddbf33a76b5913ca281660979b4c48f1df6 +riscv-soc 6613476e225e090cc9aad49be7fa504e290dd33d +s390 8eb3db95a8c8ecd6f8bb082a99ded3bbc79b023f +sh 6613476e225e090cc9aad49be7fa504e290dd33d +uml 83aec96c631e0fa75cfe6d6a1b113a32151aaa88 +xtensa a03cd7602a090eae277d2b79d43925661e7fbe9a +bcachefs 6bb3f7f4c3f4da8e09de188f2f63e8f741bba3bd +pidfd a901a3568fd26ca9c4a82d8bc5ed5b3ed844d451 +fscrypt c919330dd57835970b37676d377de3eaaea2c1e9 +afs abcbd3bfbbfe97a8912d0c929d4aa18f50d9bc52 +btrfs 932ab07c383e655d4a353bb3b67c7164cad64eb3 +ceph ded080c86b3f99683774af0441a58fc2e3d60cae +cifs 2417900a8dcec30415ba9318f02d4346a158c34b +configfs 4425c1d9b44ded655d2668e1ce95a62bccf7b21b +ecryptfs a3d78fe3e1ae8c6a1901635c54a1a799656f72c8 +erofs aa12a790d31be14b289d5a2c6f41ca535fcc7841 +exfat 8b29fa18400ccb7fb681f105d74b2cabb59e5d62 +exportfs 42c3732fa8073717dd7d924472f1c0bc5b452fdc +ext3 cd04011c5859b711e5030bf2e0fd14615be9ccfe +ext4 68da4c44b994aea797eb9821acb3a4a36015293e +f2fs f31438c16879f0612fae83f02b11367c906a7d00 +fsverity 919dc320956ea353a7fb2d84265195ad5ef525ac +fuse 3f29f1c336c0e8a4bec52f1e5217f88835553e5b +gfs2 acd2d246f4b26460d0499bc4e0042f63380e526b +jfs e42e29cc442395d62f1a8963ec2dfb700ba6a5d7 +ksmbd 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +nfs 052d534373b7ed33712a63d5e17b2b6cdbce84fd +nfs-anna 57331a59ac0d680f606403eb24edd3c35aecba31 +nfsd 6c1c91f97746154611770e14f9be4d65a52f77c6 +ntfs3 622cd3daa8eae37359a6fd3c07c36d19f66606b5 +orangefs 31720a2b109b3080eb77e97b8f6f50a27b4ae599 +overlayfs d17bb4620f90f81d8a8a45c3d025c679a1b5efcd +ubifs adbf4c4954e33e623897058a617c583d65a177f6 +v9fs ff49bf1867578f23a5ffdd38f927f6e1e16796c4 +v9fs-ericvh be57855f505003c5cafff40338d5d0f23b00ba4d +xfs 881f78f472556ed05588172d5b5676b48dc48240 +zonefs 8812387d056957355ef1d026cd38bed3830649db +iomap 3ac974796e5d94509b85a403449132ea660127c2 +djw-vfs ce85a1e04645b1ed386b074297df27ab5b8801c0 +file-locks e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9 +iversion e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9 +vfs-brauner de9861b0c277e1d86067255a2634e19a9a0e329b +vfs 052d534373b7ed33712a63d5e17b2b6cdbce84fd +printk 6c3a34e38436a2a3f7a1fa764c108ee19b05b893 +pci 95bf9132f8b48f8ca59eacd9b40afa5cce4feb53 +pstore 24a0b5e196cf70ccff97bc0add6fa7178ad50cc4 +hid a54f72c74c2d6db65b3f3d3bcb9b6487c719d7fb +i2c 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +i2c-host 11f1357336cde9924da0b455e528f11fbd5011f4 +i3c 4fa0888f6f3e6a67cac5afafb23e33f8222cfdd0 +hwmon-staging 6120fec68e78eefdf7d89325668c082a90817c75 +jc_docs 5c7944ca7b13978744ec83e131aef9255fdbabbe +v4l-dvb 6613476e225e090cc9aad49be7fa504e290dd33d +v4l-dvb-next 04447d48afd365a837e23cde631517f166045b9d +pm 7543bfcb6b1a6cecb3e1df9a1bf864d916e8b40b +cpufreq-arm eaffb10b51bf74415c9252fd8fb4dd77122501ee +cpupower 0086ffec768bec6f5d61fc7e406af640eb912a24 +devfreq aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6 +pmdomain 90a7463fae9eb9f68a6e4ff3e8868beb8fbfc649 +opp ace4b31b297dfd7b8c969ff5046c8128c3e025be +thermal 5314b1543787e6cd5d248186fcfd5c5fc4ca2146 +dlm 5beebc1dda47719dac85830c53bca1a0ab497d96 +rdma a400073ce3dd3dbdf843e6c9c0a0a7f6ca9f05d7 +net-next e7f8df0e81bf73ab6dc6ac1dc01273fa06564119 +bpf-next cd1c194ffe28820e0389299060b2cd425ce3ec44 +ipsec-next ab1e1a38de240057ed108075abd1309c02e2a2a4 +mlx5-next d727d27db536faea7178290c677cc0567f647231 +netfilter-next 5264ab612e28058536de8069bcf83eb20fd65c29 +ipvs-next 7ad269787b6615ca56bb161063331991fce51abf +bluetooth 64692e12507b3efd71b4ff5596c9742d91f1ffe5 +wireless-next 3fbf61207c66ff7ac9b60ab76d4bfd239f97e973 +wpan-next 2373699560a754079579b7722b50d1d38de1960e +wpan-staging 2373699560a754079579b7722b50d1d38de1960e +mtd 98d4fda8f2d4bc3fb97958d2ef4c90e161a628f2 +nand 023e6aad7e5e7f2e086c399abd0675589c123728 +spi-nor 3c0e1dfa703cd2a16fbfb1290b0970b61add3cde +crypto 4d314d27130b674a3687135fe94f44a40f107f76 +drm 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +drm-ci ad6bfe1b66a5c146ec236847eca7af4c8806d666 +drm-exynos 6613476e225e090cc9aad49be7fa504e290dd33d +drm-misc 1f1626ac0428820f998245478610f452650bcab5 +amdgpu 9217b91c64587459362f211b0310e2bdaeb67719 +drm-intel fe4c6ff50c68aa467f04c376fa3cf2a60e62c07d +drm-tegra 2429b3c529da29d4277d519bd66d034842dcd70c +drm-msm d4ca26ac4be0d9aea7005c40df75e6775749671b +drm-msm-lumag d4ca26ac4be0d9aea7005c40df75e6775749671b +etnaviv c9959996a8fc171bbb2c2d9c7478306f331a6cca +fbdev 72fee6b0a3a4ad6c5131d4c20e8ab7253b16e38b +regmap a1214cdfe92bd421a449f16d75d4dae2df36060b +sound 8b87a7863fa57f87f5a63fb2dd69a4400593d92c +ieee1394 dd754748f1bef240c38f987cabd70366b7e91474 +sound-asoc e3468b7aab5cf18b86ee67e02b6e70c72e792165 +modules 3559ad395bf02f3dee576dc9acab4ce330ce57b5 +input 7d0f351da46098b3bbb147f886f059473b84ff48 +block b48b5a7c9bc1da4e78105780ace46edd74832ab8 +device-mapper 4eacc39d5529cb0bb6bd9a6608658703d7af1e05 +libata c8474c7273ac3bad718c33118aa82efb7b374f6e +pcmcia 4f733de8b78a209501041a4b0a44c83ece0e8933 +mmc 4e99ffb173faaf38f010acb369bff57a20e9e531 +mfd 1e0ea9e75ff3f395ad6f85f0be2258ef114a53dc +backlight 3b75d271e161e22aff8171940a77510d2fb2ad6f +battery 4c5d387d79a65355b73e526cbf5754a9dcd5377b +regulator a2fc922ece40709ac81f7a9901dd84ecb3298740 +security 5a287d3d2b9de2b3e747132c615599907ba5c3c1 +apparmor 8ead196be219adade3bd0d4115cc9b8506643121 +integrity 1ed4b563100230ea68821a2b25a3d9f25388a3e6 +selinux 90593caf7db74da2300f7a7056a26ae000b3e7cd +smack f0816d4332c3f764cd42cf8124a193e17eeccba9 +tomoyo 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +tpmdd 610347effc2ecb5ededf5037e82240b151f883ab +watchdog 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +iommu 75f74f85a42eb294b657f847c33e1bb7921dbec9 +audit aa13b709084a0287ef250a9fbde5993e4dfc3078 +devicetree 85f838adad5487f96ff3acf6d3eb8263a39a0757 +dt-krzk 8c82b4eef2972200f6171aaa260d7bba2ad29889 +mailbox cd795fb0c352c1f70e5fa437b01572c8693e1b77 +spi 60fbb72e3018c87adc2e3e5ff743ce757c8b4e89 +tip 078b7b997b47c7166c1240cf1d39db9f646a56be +clockevents 0076a37a426b6c850a0b41b814952760e4a70fcf +edac 5f9d6dfd6c4a9efd221faf1925f382f9562a3840 +ftrace 4af12c95cbe888b71e905058c48e8d1e779264b5 +rcu bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c +kvm a9ef277488cfc1b7da88235dc11c338a14f34835 +kvm-arm 87bbb6a32237af19d248957a268b3536cfeca6ba +kvms390 10f7b1dcdfe05efcd26e90e337daf1bfd8f4a6da +kvm-ppc 180c6b072bf360b686e53d893d8dcf7dbbaec6bb +kvm-riscv 4d0e8f9a361b3a1f7b67418c536b258323de734f +kvm-x86 f0f3b810edda57f317d79f452056786257089667 +xen-tip 2d2db7d40254d5fb53b11ebd703cd1ed0c5de7a1 +percpu 2d9ad81ef93570bc0d4929d05d0601ea400d6fcf +workqueues 15930da42f8981dc42c19038042947b475b19f47 +drivers-x86 3f399b5d7189bcb608c75abc85fe39f7a5509cfa +chrome-platform 6613476e225e090cc9aad49be7fa504e290dd33d +chrome-platform-firmware 6613476e225e090cc9aad49be7fa504e290dd33d +hsi fa72d143471d04ce3055d8dad9743b08c19e4060 +leds-lj 4289e434c46c8cbd32cf8b67fa7689b3d2ca4361 +ipmi 296455ade1fdcf5f8f8c033201633b60946c589a +driver-core f297a3844aa059c53be3f69be85ebc071b8a6d16 +usb f1a27f081c1fa1eeebf38406e45f29636114470f +thunderbolt dec6a613574cd3dea799170b7aaa8fd76e22f176 +usb-serial 6613476e225e090cc9aad49be7fa504e290dd33d +tty fccc9d9233f918ee50cf2955ae7134a7f3418351 +char-misc 390b60f7638a8755beee22f83a5fbe54fdc9831d +accel dddb2e526a3650f3aa19c9ed315ae0f49c768810 +coresight 60e5f23dc5d68ec01e6dae8f4311230c7d2ccb8a +fastrpc 6613476e225e090cc9aad49be7fa504e290dd33d +fpga 6613476e225e090cc9aad49be7fa504e290dd33d +icc 7158ba962f419c9e490e187b2a150f9ec5296bfe +iio a0295c1bd4a79461f291c9b0df0523cbbeb75560 +phy-next 25ee21fc97db6cb7f476464e4aa8616652b3be49 +soundwire 0707496ff4e416ea08c90053fd5fde5811b11b22 +extcon 7803680964c025f598f827b7ea7433467ef21a56 +gnss 41bccc98fb7931d63d03f326a746ac4d429c1dd3 +vfio 78f70c02bdbccb5e9b0b0c728185d4aeb7044ace +w1 6613476e225e090cc9aad49be7fa504e290dd33d +spmi b85ea95d086471afb4ad062012a4d73cd328fa86 +staging ce54e9342124ededf0a00ed4e8a8aee535bfbf00 +counter-next 0b3bbd8f9baf245ec77d86f6f5bc902105b4bfa9 +mux 44c026a73be8038f03dbdeef028b642880cf1511 +dmaengine 93bdff7bb83a9ea79f41d4e48e1711fd5f4ec4ed +cgroup 8d4c171f451d384f3a287eb14bd60825d0b2381b +scsi 890d900e7fec7f7956c26bd47b4f0f07a0a507b1 +scsi-mkp 3f90ac7138edb995b4312221647b58afcc15ec06 +vhost f16d65124380ac6de8055c4a8e5373a1043bb09b +rpmsg 99f59b148871dadb9104366e3d25b120a97f897b +gpio 0bb80ecc33a8fb5a682236443c1e740d5c917d1d +gpio-brgl 6933ba529d06afdd3faf5501855e410b46b77160 +gpio-intel 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl 47eed1127d2af6fada49565efca4671a56e5839d +pinctrl-intel 6613476e225e090cc9aad49be7fa504e290dd33d +pinctrl-renesas fea58424e2523376ece6f734479e63061e17ad7f +pinctrl-samsung 6613476e225e090cc9aad49be7fa504e290dd33d +pwm 979c6fe7e799d2cab0a99c4b8c41cc48f10aca0c +ktest 7dc8e24f0e09834341f84d37433840b353d64bc8 +kselftest 6a71770442b5b7cf8f880ca1c0a72456c918c757 +kunit 6613476e225e090cc9aad49be7fa504e290dd33d +kunit-next 6613476e225e090cc9aad49be7fa504e290dd33d +livepatching 602bf18307981f3bfd9ebf19921791a4256d3fd1 +rtc 14688f1a91e1f37bc6bf50ff5241e857f24338e0 +nvdimm a085a5eb6594a3ebe5c275e9c2c2d341f686c23c +at24 6613476e225e090cc9aad49be7fa504e290dd33d +ntb 9341b37ec17a8793e8439e9b18354ba69556b786 +seccomp 0c6f28a844311b8f81b4b117f586189c704d3f33 +fsi c5eeb63edac9497f9a0d46d3b75cf8b293771ecf +slimbus 04b945e4cf81a12365f8207a4d34dbc81ba17413 +nvmem a0cfd5e997824d0bd8c7620d40cdb324121a2fc7 +xarray 2a15de80dd0f7e04a823291aa9eb49c5294f56af +hyperv ce9ecca0238b140b88f43859b211c9fdfd8e5b70 +auxdisplay c52391fafcefe4c562bdac62088a2735c185b942 +kgdb 4f41d30cd6dc865c3cbc1a852372321eba6d4e4c +hmm 6613476e225e090cc9aad49be7fa504e290dd33d +cfi 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5 +mhi 8ddf54a32111f6dbe06cd318af443c6545a6c037 +memblock 2159bd4e905704b1765b6b883ea15e51ad986a6a +cxl 73bf93edeeea866b0b6efbc8d2595bdaaba7f1a5 +zstd 3f832dfb8a8eafee3cecd479d99651a64a61485a +efi 4afa688d7141ae7a166d32224abbfd536acccfca +unicode 367122c529f35b4655acbe33c0cc4d6d3b32ba71 +slab 7d2ec24bd8a59853c7660d3eac50a3b7ffce8ae3 +random 615d300648869c774bd1fe54b4627bb0c20faed4 +landlock 2f8bb71d737c25ca97a83e47b445837aa96cec77 +rust f090f0d0eea9666a96702b29bc9a64cbabee85c5 +sysctl 6613476e225e090cc9aad49be7fa504e290dd33d +execve 90383cc07895183c75a0db2460301c2ffd912359 +bitmap 071ad962baf5e857fd965595421cf6fb588610ed +hte b85ea95d086471afb4ad062012a4d73cd328fa86 +kspp 34b82a2fb7475aba5adfd3ae9f2c66da7f1979f7 +kspp-gustavo 6613476e225e090cc9aad49be7fa504e290dd33d +nolibc 6613476e225e090cc9aad49be7fa504e290dd33d +tsm f4738f56d1dc62aaba69b33702a5ab098f1b8c63 +iommufd 6613476e225e090cc9aad49be7fa504e290dd33d +header_cleanup 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b diff --git a/Next/Trees b/Next/Trees new file mode 100644 index 00000000000000..f8b6a7c11afd7c --- /dev/null +++ b/Next/Trees @@ -0,0 +1,372 @@ +Trees included into this release: + +Name Type Url +---- ---- --- +origin git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git#master +fixes git git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git#fixes +mm-hotfixes git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-hotfixes-unstable +kbuild-current git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#fixes +arc-current git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-curr +arm-current git git://git.armlinux.org.uk/~rmk/linux-arm.git#fixes +arm64-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/fixes +arm-soc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#arm/fixes +davinci-current git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-current +drivers-memory-fixes git https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#fixes +sophgo-fixes git https://github.com/sophgo/linux.git#fixes +tee-fixes git https://git.linaro.org/people/jens.wiklander/linux-tee.git#fixes +m68k-current git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-linus +powerpc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#fixes +s390-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#fixes +fscrypt-current git git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-current +fsverity-current git git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-current +net git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git#main +bpf git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git#master +ipsec git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git#master +netfilter git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git#main +ipvs git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git#main +wireless git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git#for-next +wpan git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git#master +rdma-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-rc +sound-current git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-linus +sound-asoc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-linus +regmap-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-linus +regulator-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-linus +spi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-linus +pci-current git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#for-linus +driver-core.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-linus +tty.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-linus +usb.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-linus +usb-serial-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-linus +phy git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#fixes +staging.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-linus +iio-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#fixes-togreg +counter-current git git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-current +char-misc.current git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-linus +soundwire-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#fixes +thunderbolt-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#fixes +input-current git git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#for-linus +crypto-current git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git#master +vfio-fixes git git://github.com/awilliam/linux-vfio.git#for-linus +kselftest-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#fixes +modules-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-linus +dmaengine-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#fixes +backlight-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-fixes +mtd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/fixes +mfd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-fixes +v4l-dvb-fixes git https://git.linuxtv.org/media_stage.git#fixes +reset-fixes git https://git.pengutronix.de/git/pza/linux#reset/fixes +mips-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-fixes +at91-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-fixes +omap-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#fixes +kvm-fixes git git://git.kernel.org/pub/scm/virt/kvm/kvm.git#master +kvms390-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#master +hwmon-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon +nvdimm-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-fixes +cxl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#fixes +btrfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#next-fixes +vfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#fixes +dma-mapping-fixes git git://git.infradead.org/users/hch/dma-mapping.git#for-linus +drivers-x86-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#fixes +samsung-krzk-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#fixes +pinctrl-samsung-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#fixes +devicetree-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#dt/linus +dt-krzk-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#fixes +scsi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#fixes +drm-fixes git git://git.freedesktop.org/git/drm/drm.git#drm-fixes +drm-intel-fixes git git://anongit.freedesktop.org/drm-intel#for-linux-next-fixes +mmc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#fixes +rtc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-fixes +gnss-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-linus +hyperv-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-fixes +soc-fsl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#fix +risc-v-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#fixes +riscv-dt-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-fixes +riscv-soc-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-fixes +fpga-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#fixes +spdx git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git#spdx-linus +gpio-brgl-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-current +gpio-intel-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#fixes +pinctrl-intel-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#fixes +erofs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#fixes +kunit-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit-fixes +ubifs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#fixes +memblock-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#fixes +nfsd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-fixes +renesas-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#fixes +perf-current git git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools#perf-tools +efi-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#urgent +zstd-fixes git https://github.com/terrelln/linux.git#zstd-linus +battery-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#fixes +uml-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#fixes +iommufd-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-rc +rust-fixes git https://github.com/Rust-for-Linux/linux.git#rust-fixes +v9fs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#fixes/next +w1-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#fixes +pmdomain-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#fixes +overlayfs-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#ovl-fixes +i2c-host-fixes git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host-fixes +drm-misc-fixes git git://anongit.freedesktop.org/drm/drm-misc#for-linux-next-fixes +mm-stable git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-stable +mm-nonmm-stable git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-nonmm-stable +mm git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-everything +kbuild git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#for-next +clang-format git https://github.com/ojeda/linux.git#clang-format +perf git git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git#perf-tools-next +compiler-attributes git https://github.com/ojeda/linux.git#compiler-attributes +dma-mapping git git://git.infradead.org/users/hch/dma-mapping.git#for-next +asm-generic git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git#master +arc git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-next +arm git git://git.armlinux.org.uk/~rmk/linux-arm.git#for-next +arm64 git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/core +arm-perf git git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git#for-next/perf +arm-soc git git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#for-next +amlogic git git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git#for-next +asahi-soc git https://github.com/AsahiLinux/linux.git#asahi-soc/for-next +aspeed git git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git#for-next +at91 git git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-next +broadcom git https://github.com/Broadcom/stblinux.git#next +davinci git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-next +drivers-memory git https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#for-next +imx-mxs git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git#for-next +mediatek git git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux.git#for-next +mvebu git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git#for-next +omap git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#for-next +qcom git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git#for-next +renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#next +reset git https://git.pengutronix.de/git/pza/linux#reset/next +rockchip git git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git#for-next +samsung-krzk git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#for-next +scmi git git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git#for-linux-next +sophgo git https://github.com/sophgo/linux.git#for-next +stm32 git git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git#stm32-next +sunxi git git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git#sunxi/for-next +tee git https://git.linaro.org/people/jens.wiklander/linux-tee.git#next +tegra git git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git#for-next +ti git git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git#ti-next +xilinx git git://github.com/Xilinx/linux-xlnx.git#for-next +clk git git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git#clk-next +clk-imx git git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git#for-next +clk-renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-clk +csky git git://github.com/c-sky/csky-linux.git#linux-next +loongarch git git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git#loongarch-next +m68k git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-next +m68knommu git git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git#for-next +microblaze git git://git.monstr.eu/linux-2.6-microblaze.git#next +mips git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-next +openrisc git git://github.com/openrisc/linux.git#for-next +parisc-hd git git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git#for-next +powerpc git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#next +soc-fsl git git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#next +risc-v git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#for-next +riscv-dt git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-for-next +riscv-soc git git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-for-next +s390 git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#for-next +sh git git:git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git#for-next +uml git git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#next +xtensa git git://github.com/jcmvbkbc/linux-xtensa.git#xtensa-for-next +bcachefs git https://evilpiepirate.org/git/bcachefs.git#for-next +pidfd git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git#for-next +fscrypt git git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-next +afs git git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git#afs-next +btrfs git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#for-next +ceph git git://github.com/ceph/ceph-client.git#master +cifs git git://git.samba.org/sfrench/cifs-2.6.git#for-next +configfs git git://git.infradead.org/users/hch/configfs.git#for-next +ecryptfs git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git#next +erofs git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#dev +exfat git git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git#dev +exportfs git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#exportfs-next +ext3 git git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git#for_next +ext4 git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git#dev +f2fs git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git#dev +fsverity git git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-next +fuse git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#for-next +gfs2 git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git#for-next +jfs git git://github.com/kleikamp/linux-shaggy.git#jfs-next +ksmbd git https://github.com/smfrench/smb3-kernel.git#ksmbd-for-next +nfs git git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git#linux-next +nfs-anna git git://git.linux-nfs.org/projects/anna/linux-nfs.git#linux-next +nfsd git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-next +ntfs3 git https://github.com/Paragon-Software-Group/linux-ntfs3.git#master +orangefs git git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux#for-next +overlayfs git git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#overlayfs-next +ubifs git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#next +v9fs git git://github.com/martinetd/linux#9p-next +v9fs-ericvh git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#ericvh/for-next +xfs git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#for-next +zonefs git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git#for-next +iomap git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#iomap-for-next +djw-vfs git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#vfs-for-next +file-locks git git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#locks-next +iversion git git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#iversion-next +vfs-brauner git git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git#vfs.all +vfs git git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#for-next +printk git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git#for-next +pci git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#next +pstore git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/pstore +hid git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git#for-next +i2c git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git#i2c/for-next +i2c-host git git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host +i3c git git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git#i3c/next +hwmon-staging git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon-next +jc_docs git git://git.lwn.net/linux.git#docs-next +v4l-dvb git git://linuxtv.org/media_tree.git#master +v4l-dvb-next git git://linuxtv.org/mchehab/media-next.git#master +pm git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git#linux-next +cpufreq-arm git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#cpufreq/arm/linux-next +cpupower git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git#cpupower +devfreq git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git#devfreq-next +pmdomain git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#next +opp git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#opp/linux-next +thermal git git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git#thermal/linux-next +dlm git git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git#next +rdma git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-next +net-next git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git#main +bpf-next git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git#for-next +ipsec-next git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git#master +mlx5-next git git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git#mlx5-next +netfilter-next git git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git#main +ipvs-next git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git#main +bluetooth git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git#master +wireless-next git git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git#for-next +wpan-next git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#master +wpan-staging git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#staging +mtd git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/next +nand git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#nand/next +spi-nor git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#spi-nor/next +crypto git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git#master +drm git git://git.freedesktop.org/git/drm/drm.git#drm-next +drm-ci git git://git.freedesktop.org/git/drm/drm.git#topic/drm-ci +drm-exynos git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git#for-linux-next +drm-misc git git://anongit.freedesktop.org/drm/drm-misc#for-linux-next +amdgpu git https://gitlab.freedesktop.org/agd5f/linux#drm-next +drm-intel git git://anongit.freedesktop.org/drm-intel#for-linux-next +drm-tegra git https://gitlab.freedesktop.org/drm/tegra.git#for-next +drm-msm git https://gitlab.freedesktop.org/drm/msm.git#msm-next +drm-msm-lumag git https://gitlab.freedesktop.org/lumag/msm.git#msm-next-lumag +etnaviv git https://git.pengutronix.de/git/lst/linux#etnaviv/next +fbdev git git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git#for-next +regmap git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-next +sound git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-next +ieee1394 git https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git#for-next +sound-asoc git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-next +modules git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-next +input git git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#next +block git git://git.kernel.dk/linux-block.git#for-next +device-mapper git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git#for-next +libata git git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux#for-next +pcmcia git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git#pcmcia-next +mmc git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#next +mfd git git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-next +backlight git git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-next +battery git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#for-next +regulator git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-next +security git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git#next +apparmor git git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor#apparmor-next +integrity git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity#next-integrity +selinux git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git#next +smack git git://github.com/cschaufler/smack-next#next +tomoyo git https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git#master +tpmdd git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git#next +watchdog git git://www.linux-watchdog.org/linux-watchdog-next.git#master +iommu git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git#next +audit git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git#next +devicetree git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#for-next +dt-krzk git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#for-next +mailbox git git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git#for-next +spi git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-next +tip git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git#master +clockevents git git://git.linaro.org/people/daniel.lezcano/linux.git#timers/drivers/next +edac git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git#edac-for-next +ftrace git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git#for-next +rcu git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git#rcu/next +kvm git git://git.kernel.org/pub/scm/virt/kvm/kvm.git#next +kvm-arm git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git#next +kvms390 git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#next +kvm-ppc git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#topic/ppc-kvm +kvm-riscv git https://github.com/kvm-riscv/linux.git#riscv_kvm_next +kvm-x86 git https://github.com/kvm-x86/linux.git#next +xen-tip git git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git#linux-next +percpu git git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git#for-next +workqueues git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git#for-next +drivers-x86 git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#for-next +chrome-platform git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-next +chrome-platform-firmware git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-firmware-next +hsi git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git#for-next +leds-lj git git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git#for-leds-next +ipmi git git://github.com/cminyard/linux-ipmi.git#for-next +driver-core git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-next +usb git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-next +thunderbolt git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#next +usb-serial git git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-next +tty git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-next +char-misc git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-next +accel git git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git#habanalabs-next +coresight git git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git#next +fastrpc git git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git#for-next +fpga git git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#for-next +icc git git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git#icc-next +iio git git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#togreg +phy-next git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#next +soundwire git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#next +extcon git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git#extcon-next +gnss git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-next +vfio git git://github.com/awilliam/linux-vfio.git#next +w1 git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#for-next +spmi git git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git#spmi-next +staging git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-next +counter-next git git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-next +mux git https://gitlab.com/peda-linux/mux.git#for-next +dmaengine git git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#next +cgroup git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git#for-next +scsi git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git#for-next +scsi-mkp git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#for-next +vhost git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git#linux-next +rpmsg git git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git#for-next +gpio git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git#for-next +gpio-brgl git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-next +gpio-intel git git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#for-next +pinctrl git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git#for-next +pinctrl-intel git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#for-next +pinctrl-renesas git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-pinctrl +pinctrl-samsung git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#for-next +pwm git git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git#pwm/for-next +ktest git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git#for-next +kselftest git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#next +kunit git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#test +kunit-next git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit +livepatching git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching#for-next +rtc git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-next +nvdimm git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-for-next +at24 git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#at24/for-next +ntb git https://github.com/jonmason/ntb.git#ntb-next +seccomp git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/seccomp +fsi git git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git#next +slimbus git git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git#for-next +nvmem git git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git#for-next +xarray git git://git.infradead.org/users/willy/xarray.git#main +hyperv git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-next +auxdisplay git https://github.com/ojeda/linux.git#auxdisplay +kgdb git git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git#kgdb/for-next +hmm git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#hmm +cfi git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#cfi/next +mhi git git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git#mhi-next +memblock git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#for-next +cxl git git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#next +zstd git https://github.com/terrelln/linux.git#zstd-next +efi git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#next +unicode git git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git#for-next +slab git git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git#slab/for-next +random git git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git#master +landlock git git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git#next +rust git https://github.com/Rust-for-Linux/linux.git#rust-next +sysctl git git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git#sysctl-next +execve git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/execve +bitmap git https://github.com/norov/linux.git#bitmap-for-next +hte git git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git#for-next +kspp git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/kspp +kspp-gustavo git git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git#for-next/kspp +nolibc git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#nolibc +tsm git git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux#tsm-next +iommufd git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-next +header_cleanup git https://evilpiepirate.org/git/bcachefs.git#header_cleanup diff --git a/Next/merge.log b/Next/merge.log new file mode 100644 index 00000000000000..32af96d5094a9c --- /dev/null +++ b/Next/merge.log @@ -0,0 +1,5227 @@ +$ date -R +Thu, 01 Feb 2024 08:11:51 +1100 +$ git checkout master +Already on 'master' +$ git reset --hard stable +HEAD is now at 861c0981648f Merge tag 'jfs-6.8-rc3' of github.com:kleikamp/linux-shaggy +Merging origin/master (6764c317b6bb Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git origin/master +Updating 861c0981648f..6764c317b6bb +Fast-forward (no commit created; -m option ignored) + Documentation/dev-tools/kunit/usage.rst | 19 +++- + MAINTAINERS | 3 +- + drivers/scsi/initio.c | 3 +- + drivers/scsi/isci/request.c | 2 +- + drivers/scsi/scsi_error.c | 8 +- + drivers/scsi/scsi_lib.c | 2 +- + drivers/scsi/scsi_priv.h | 2 +- + drivers/scsi/storvsc_drv.c | 12 ++- + drivers/scsi/virtio_scsi.c | 2 - + drivers/soc/apple/mailbox.c | 6 +- + fs/erofs/compress.h | 5 +- + fs/erofs/decompressor.c | 5 +- + fs/erofs/decompressor_deflate.c | 19 ++-- + fs/erofs/decompressor_lzma.c | 17 +++- + fs/erofs/fscache.c | 2 +- + fs/erofs/inode.c | 2 +- + fs/erofs/utils.c | 2 +- + fs/erofs/zdata.c | 98 ++++++++++--------- + lib/kunit/device.c | 4 +- + lib/kunit/executor.c | 4 + + lib/kunit/kunit-test.c | 2 +- + lib/kunit/test.c | 14 ++- + tools/testing/selftests/livepatch/functions.sh | 37 ++++---- + .../testing/selftests/rseq/basic_percpu_ops_test.c | 14 ++- + tools/testing/selftests/rseq/param_test.c | 22 +++-- + .../testing/selftests/seccomp/seccomp_benchmark.c | 104 +++++++++++++-------- + 26 files changed, 251 insertions(+), 159 deletions(-) +Merging fixes/fixes (2dde18cd1d8f Linux 6.5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git fixes/fixes +Already up to date. +Merging mm-hotfixes/mm-hotfixes-unstable (bbac2bacc158 mm/zswap: don't return LRU_SKIP if we have dropped lru lock) +$ git merge -m Merge branch 'mm-hotfixes-unstable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes/mm-hotfixes-unstable +Merge made by the 'ort' strategy. + .mailmap | 1 + + arch/arm/mm/fault.c | 2 + + fs/hugetlbfs/inode.c | 19 +++++-- + fs/nilfs2/recovery.c | 7 +-- + fs/proc/array.c | 66 ++++++++++++++----------- + fs/xfs/scrub/xfile.c | 5 ++ + include/linux/pagemap.h | 14 ++++++ + kernel/exit.c | 10 ++-- + kernel/sys.c | 54 ++++++++++++-------- + mm/madvise.c | 1 + + mm/memcontrol.c | 49 +++++++++++------- + mm/memory-failure.c | 3 ++ + mm/userfaultfd.c | 14 +++--- + mm/zswap.c | 12 ++--- + tools/testing/selftests/core/close_range_test.c | 1 + + 15 files changed, 165 insertions(+), 93 deletions(-) +Merging kbuild-current/fixes (bfef491df670 kconfig: initialize sym->curr.tri to 'no' for all symbol types again) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild-current/fixes +Auto-merging Makefile +Merge made by the 'ort' strategy. + Makefile | 14 +++++++------- + arch/m68k/Makefile | 4 ++-- + arch/parisc/Makefile | 4 ++-- + arch/um/Makefile | 4 +++- + arch/x86/Makefile | 10 +++++----- + scripts/Makefile.defconf | 8 ++++---- + scripts/kconfig/symbol.c | 4 +++- + scripts/mod/modpost.c | 15 +++------------ + scripts/mod/modpost.h | 6 +----- + scripts/package/kernel.spec | 22 +++++++++++----------- + 10 files changed, 41 insertions(+), 50 deletions(-) +Merging arc-current/for-curr (861deac3b092 Linux 6.7-rc7) +$ git merge -m Merge branch 'for-curr' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc-current/for-curr +Already up to date. +Merging arm-current/fixes (f54e8634d136 ARM: 9330/1: davinci: also select PINCTRL) +$ git merge -m Merge branch 'fixes' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm-current/fixes +Already up to date. +Merging arm64-fixes/for-next/fixes (c7767f5c43df arm64: vdso32: Remove unused vdso32-offsets.h) +$ git merge -m Merge branch 'for-next/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64-fixes/for-next/fixes +Merge made by the 'ort' strategy. + arch/arm64/Makefile | 2 +- + arch/arm64/include/asm/vdso.h | 3 --- + arch/arm64/kernel/Makefile | 6 +++--- + arch/arm64/kernel/vdso32/Makefile | 9 --------- + 4 files changed, 4 insertions(+), 16 deletions(-) +Merging arm-soc-fixes/arm/fixes (1b5af823d703 soc/tegra: fix build failure on Tegra241) +$ git merge -m Merge branch 'arm/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc-fixes/arm/fixes +Merge made by the 'ort' strategy. + drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) +Merging davinci-current/davinci/for-current (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'davinci/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci-current/davinci/for-current +Already up to date. +Merging drivers-memory-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory-fixes/fixes +Already up to date. +Merging sophgo-fixes/fixes (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'fixes' of https://github.com/sophgo/linux.git sophgo-fixes/fixes +Already up to date. +Merging tee-fixes/fixes (ceaa837f96ad Linux 6.2-rc8) +$ git merge -m Merge branch 'fixes' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee-fixes/fixes +Already up to date. +Merging m68k-current/for-linus (6b9c045b0602 m68k: defconfig: Update defconfigs for v6.7-rc1) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k-current/for-linus +Already up to date. +Merging powerpc-fixes/fixes (18f14afe2816 powerpc/64s: Increase default stack size to 32KB) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc-fixes/fixes +Already up to date. +Merging s390-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390-fixes/fixes +Already up to date. +Merging fscrypt-current/for-current (4bcf6f827a79 fscrypt: check for NULL keyring in fscrypt_put_master_key_activeref()) +$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt-current/for-current +Already up to date. +Merging fsverity-current/for-current (a075bacde257 fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY) +$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity-current/for-current +Already up to date. +Merging net/main (c9ec85153fea selftests: net: add missing config for GENEVE) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git net/main +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 2 + + drivers/net/dsa/mt7530.c | 3 +- + drivers/net/dsa/qca/qca8k-8xxx.c | 3 +- + drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 2 +- + drivers/net/ethernet/google/gve/gve_rx.c | 8 +- + drivers/net/ethernet/intel/e1000e/e1000.h | 20 +++ + drivers/net/ethernet/intel/e1000e/ptp.c | 22 ++- + drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 3 +- + drivers/net/ethernet/mediatek/mtk_eth_soc.c | 5 +- + .../net/ethernet/microchip/lan966x/lan966x_port.c | 5 +- + .../net/ethernet/netronome/nfp/flower/conntrack.c | 46 ++++++- + drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 4 + + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 + + drivers/net/phy/mediatek-ge-soc.c | 147 ++++++++++++--------- + include/net/ip.h | 2 +- + net/bridge/br_multicast.c | 20 ++- + net/bridge/br_private.h | 4 +- + net/devlink/port.c | 2 +- + net/hsr/hsr_device.c | 4 +- + net/ipv4/ip_sockglue.c | 6 +- + net/ipv4/ipmr.c | 2 +- + net/ipv4/raw.c | 2 +- + net/ipv4/tcp.c | 12 +- + net/ipv4/udp.c | 2 +- + net/ipv6/addrconf_core.c | 21 ++- + net/ipv6/ip6_tunnel.c | 21 ++- + net/llc/af_llc.c | 2 + + net/nfc/nci/core.c | 4 + + net/smc/smc_core.c | 12 +- + tools/testing/selftests/net/Makefile | 6 +- + tools/testing/selftests/net/config | 9 ++ + tools/testing/selftests/net/lib.sh | 5 +- + tools/testing/selftests/net/setup_veth.sh | 2 +- + tools/testing/selftests/net/tcp_ao/config | 10 ++ + tools/testing/selftests/net/tcp_ao/settings | 1 + + tools/testing/selftests/net/udpgro.sh | 4 +- + tools/testing/selftests/net/udpgro_bench.sh | 4 +- + tools/testing/selftests/net/udpgro_frglist.sh | 6 +- + tools/testing/selftests/net/udpgro_fwd.sh | 8 +- + tools/testing/selftests/net/veth.sh | 4 +- + tools/testing/selftests/net/xdp_dummy.c | 13 ++ + 41 files changed, 326 insertions(+), 135 deletions(-) + create mode 100644 tools/testing/selftests/net/tcp_ao/config + create mode 100644 tools/testing/selftests/net/tcp_ao/settings + create mode 100644 tools/testing/selftests/net/xdp_dummy.c +Merging bpf/master (577e4432f3ac tcp: add sanity checks to rx zerocopy) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git bpf/master +Already up to date. +Merging ipsec/master (983a73da1f99 xfrm: Pass UDP encapsulation in TX packet offload) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git ipsec/master +Merge made by the 'ort' strategy. + net/xfrm/xfrm_device.c | 2 +- + net/xfrm/xfrm_policy.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging netfilter/main (a2933a8759a6 selftests: bonding: do not test arp/ns target with mode balance-alb/tlb) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git netfilter/main +Already up to date. +Merging ipvs/main (a2933a8759a6 selftests: bonding: do not test arp/ns target with mode balance-alb/tlb) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git ipvs/main +Already up to date. +Merging wireless/for-next (f3f8f0503168 wifi: fill in MODULE_DESCRIPTION()s for mt76 drivers) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git wireless/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 9 ++++----- + drivers/net/wireless/ath/ar5523/ar5523.c | 1 + + drivers/net/wireless/ath/wcn36xx/main.c | 1 + + drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c | 1 + + drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c | 1 + + drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c | 1 + + drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 1 + + drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 6 ++++-- + drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 3 ++- + drivers/net/wireless/intersil/p54/p54spi.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7603/main.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7615/main.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7615/mmio.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7615/sdio.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7615/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x0/pci.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x0/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 1 + + drivers/net/wireless/mediatek/mt76/mt76x2/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7921/main.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7921/sdio.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7921/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7925/main.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7925/pci.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7925/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt792x_core.c | 1 + + drivers/net/wireless/mediatek/mt76/mt792x_usb.c | 1 + + drivers/net/wireless/mediatek/mt76/mt7996/mmio.c | 1 + + drivers/net/wireless/mediatek/mt76/sdio.c | 1 + + drivers/net/wireless/mediatek/mt76/usb.c | 1 + + drivers/net/wireless/mediatek/mt76/util.c | 1 + + drivers/net/wireless/microchip/wilc1000/netdev.c | 1 + + drivers/net/wireless/microchip/wilc1000/sdio.c | 1 + + drivers/net/wireless/microchip/wilc1000/spi.c | 1 + + drivers/net/wireless/ti/wl1251/sdio.c | 1 + + drivers/net/wireless/ti/wl1251/spi.c | 1 + + drivers/net/wireless/ti/wl12xx/main.c | 1 + + drivers/net/wireless/ti/wl18xx/main.c | 1 + + drivers/net/wireless/ti/wlcore/main.c | 1 + + drivers/net/wireless/ti/wlcore/sdio.c | 1 + + drivers/net/wireless/ti/wlcore/spi.c | 1 + + net/mac80211/wbrf.c | 2 -- + net/wireless/core.c | 3 ++- + 51 files changed, 58 insertions(+), 11 deletions(-) +Merging wpan/master (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git wpan/master +Already up to date. +Merging rdma-fixes/for-rc (80dde187f734 RDMA/bnxt_re: Add a missing check in bnxt_qplib_query_srq) +$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma-fixes/for-rc +Merge made by the 'ort' strategy. + drivers/infiniband/hw/bnxt_re/ib_verbs.c | 43 +++++++++++++++++++++----------- + drivers/infiniband/hw/bnxt_re/main.c | 3 --- + drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++- + drivers/infiniband/hw/hfi1/pio.c | 6 ++++- + 4 files changed, 35 insertions(+), 20 deletions(-) +Merging sound-current/for-linus (2468e8922d2f ALSA: hda/realtek: Apply headset jack quirk for non-bass alc287 thinkpads) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound-current/for-linus +Merge made by the 'ort' strategy. + sound/core/pcm.c | 4 ++++ + sound/pci/hda/cs35l41_hda_property.c | 4 ++++ + sound/pci/hda/hda_intel.c | 6 ++++-- + sound/pci/hda/patch_cs8409.c | 1 + + sound/pci/hda/patch_realtek.c | 15 +++++++++++--- + sound/usb/clock.c | 24 ++++++++++++++++++++++- + sound/usb/format.c | 20 +++++++++++++++++++ + sound/usb/midi2.c | 2 +- + sound/usb/quirks.c | 38 +++++++++++++++++++++--------------- + sound/virtio/virtio_card.c | 2 -- + sound/virtio/virtio_ctl_msg.c | 2 -- + sound/virtio/virtio_pcm_msg.c | 2 -- + 12 files changed, 91 insertions(+), 29 deletions(-) +Merging sound-asoc-fixes/for-linus (eeab239d6a24 ASoC: wcd934x: fix an incorrect use of kstrndup()) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc-fixes/for-linus +Merge made by the 'ort' strategy. + .../bindings/sound/allwinner,sun4i-a10-spdif.yaml | 5 +- + sound/soc/amd/acp/acp-mach-common.c | 16 +- + sound/soc/amd/acp/acp-sof-mach.c | 4 + + sound/soc/amd/acp/acp3x-es83xx/acp3x-es83xx.c | 8 + + sound/soc/amd/yc/acp6x-mach.c | 7 + + sound/soc/codecs/es8326.c | 186 +++++++++++++++------ + sound/soc/codecs/es8326.h | 3 +- + sound/soc/codecs/lpass-wsa-macro.c | 7 - + sound/soc/codecs/wcd9335.c | 4 - + sound/soc/codecs/wcd934x.c | 3 +- + sound/soc/codecs/wcd938x.c | 8 +- + sound/soc/codecs/wsa883x.c | 6 +- + sound/soc/qcom/sc8280xp.c | 12 +- + sound/soc/soc-core.c | 5 +- + sound/soc/sunxi/sun4i-spdif.c | 5 + + 15 files changed, 203 insertions(+), 76 deletions(-) + mode change 100755 => 100644 sound/soc/codecs/es8326.c +Merging regmap-fixes/for-linus (8b921545ddc6 Merge remote-tracking branch 'regmap/for-6.7' into regmap-linus) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap-fixes/for-linus +Merge made by the 'ort' strategy. +Merging regulator-fixes/for-linus (a3fa9838e814 regulator (max5970): Fix IRQ handler) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator-fixes/for-linus +Merge made by the 'ort' strategy. + drivers/regulator/max5970-regulator.c | 2 +- + drivers/regulator/pwm-regulator.c | 43 +++++++++++++++++++++++++++++++++++ + drivers/regulator/ti-abb-regulator.c | 22 +++++++++++++++--- + 3 files changed, 63 insertions(+), 4 deletions(-) +Merging spi-fixes/for-linus (6500ad28fd5d spi: sh-msiof: avoid integer overflow in constants) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi-fixes/for-linus +Merge made by the 'ort' strategy. + drivers/spi/spi-sh-msiof.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) +Merging pci-current/for-linus (925bd5e08106 MAINTAINERS: Add Manivannan Sadhasivam as PCI Endpoint maintainer) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci-current/for-linus +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 3 +- + drivers/pci/bus.c | 49 +++++++++++++-------- + drivers/pci/controller/dwc/pcie-qcom.c | 2 +- + drivers/pci/pci.c | 78 ++++++++++++++++++++++------------ + drivers/pci/pci.h | 4 +- + drivers/pci/pcie/aspm.c | 13 ++++-- + include/linux/pci.h | 5 +++ + 7 files changed, 102 insertions(+), 52 deletions(-) +Merging driver-core.current/driver-core-linus (98323e9d7017 topology: Set capacity_freq_ref in all cases) +$ git merge -m Merge branch 'driver-core-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core.current/driver-core-linus +Merge made by the 'ort' strategy. + drivers/base/arch_topology.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) +Merging tty.current/tty-linus (b35f8dbbce81 serial: max310x: prevent infinite while() loop in port startup) +$ git merge -m Merge branch 'tty-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty.current/tty-linus +Merge made by the 'ort' strategy. + drivers/tty/serial/8250/8250_pci1xxxx.c | 4 +-- + drivers/tty/serial/max310x.c | 53 ++++++++++++++++++++++++++------- + drivers/tty/serial/serial_core.c | 2 +- + include/uapi/linux/serial.h | 13 ++++---- + 4 files changed, 53 insertions(+), 19 deletions(-) +Merging usb.current/usb-linus (f2e5d3de7e1f usb: typec: tcpm: fix the PD disabled case) +$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb.current/usb-linus +Merge made by the 'ort' strategy. + Documentation/usb/gadget-testing.rst | 22 +++---- + drivers/usb/chipidea/ci.h | 2 + + drivers/usb/chipidea/core.c | 44 +++++++------- + drivers/usb/common/ulpi.c | 2 +- + drivers/usb/core/hub.c | 46 ++++++++++----- + drivers/usb/dwc3/dwc3-pci.c | 4 ++ + drivers/usb/dwc3/gadget.c | 6 +- + drivers/usb/dwc3/host.c | 4 +- + drivers/usb/gadget/function/f_mass_storage.c | 20 ++++++- + drivers/usb/gadget/function/f_ncm.c | 8 +-- + drivers/usb/gadget/udc/pch_udc.c | 1 - + drivers/usb/host/xhci-mem.c | 16 ++--- + drivers/usb/host/xhci-plat.c | 3 + + drivers/usb/host/xhci-ring.c | 80 ++++++++++++++++++++----- + drivers/usb/host/xhci.h | 1 + + drivers/usb/typec/tcpm/tcpm.c | 6 +- + drivers/usb/typec/ucsi/ucsi.c | 2 + + drivers/usb/typec/ucsi/ucsi_acpi.c | 88 +++++++++++++++++++++++++--- + 18 files changed, 264 insertions(+), 91 deletions(-) +Merging usb-serial-fixes/usb-linus (b4a1f4eaf1d7 USB: serial: option: add Fibocom FM101-GL variant) +$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial-fixes/usb-linus +Merge made by the 'ort' strategy. + drivers/usb/serial/cp210x.c | 1 + + drivers/usb/serial/option.c | 1 + + drivers/usb/serial/qcserial.c | 2 ++ + 3 files changed, 4 insertions(+) +Merging phy/fixes (7104ba0f1958 phy: ti: phy-omap-usb2: Fix NULL pointer dereference for SRP) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy/fixes +Merge made by the 'ort' strategy. + drivers/phy/microchip/lan966x_serdes.c | 2 ++ + drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 30 ++++++++++++++++++++++++++++-- + drivers/phy/renesas/phy-rcar-gen3-usb2.c | 4 ---- + drivers/phy/ti/phy-omap-usb2.c | 4 ++-- + 4 files changed, 32 insertions(+), 8 deletions(-) +Merging staging.current/staging-linus (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'staging-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging.current/staging-linus +Already up to date. +Merging iio-fixes/fixes-togreg (6f6c72acddf4 iio: move LIGHT_UVA and LIGHT_UVB to the end of iio_modifier) +$ git merge -m Merge branch 'fixes-togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio-fixes/fixes-togreg +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 8 ++++++++ + drivers/iio/adc/ad7091r8.c | 2 +- + drivers/iio/humidity/Kconfig | 12 ++++++++++++ + drivers/iio/humidity/Makefile | 1 + + drivers/iio/humidity/hdc3020.c | 2 +- + drivers/iio/imu/bno055/Kconfig | 1 + + drivers/iio/industrialio-core.c | 5 ++++- + drivers/iio/magnetometer/rm3100-core.c | 10 ++++++++-- + drivers/iio/pressure/bmp280-spi.c | 1 + + drivers/staging/iio/impedance-analyzer/ad5933.c | 2 +- + include/linux/iio/adc/ad_sigma_delta.h | 4 +++- + include/linux/iio/imu/adis.h | 3 ++- + include/uapi/linux/iio/types.h | 4 ++-- + 13 files changed, 45 insertions(+), 10 deletions(-) +Merging counter-current/counter-current (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'counter-current' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-current/counter-current +Already up to date. +Merging char-misc.current/char-misc-linus (ac9762a74c7c misc: open-dice: Fix spurious lockdep warning) +$ git merge -m Merge branch 'char-misc-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc.current/char-misc-linus +Merge made by the 'ort' strategy. + drivers/misc/fastrpc.c | 2 +- + drivers/misc/open-dice.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging soundwire-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire-fixes/fixes +Already up to date. +Merging thunderbolt-fixes/fixes (ec4d82f855ce thunderbolt: Fix setting the CNS bit in ROUTER_CS_5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt-fixes/fixes +Merge made by the 'ort' strategy. + drivers/thunderbolt/tb_regs.h | 2 +- + drivers/thunderbolt/usb4.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging input-current/for-linus (2b9c3eb32a69 Input: bcm5974 - check endpoint type before starting traffic) +$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input-current/for-linus +Merge made by the 'ort' strategy. + drivers/input/joystick/xpad.c | 2 ++ + drivers/input/mouse/bcm5974.c | 20 ++++++++++++++++++++ + drivers/input/touchscreen/goodix.c | 3 ++- + 3 files changed, 24 insertions(+), 1 deletion(-) +Merging crypto-current/master (c5a2f74db71a crypto: caam - fix asynchronous hash) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git crypto-current/master +Merge made by the 'ort' strategy. + drivers/crypto/caam/caamalg_qi2.c | 7 +++++-- + drivers/crypto/caam/caamhash.c | 7 +++++-- + drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c | 1 + + 3 files changed, 11 insertions(+), 4 deletions(-) +Merging vfio-fixes/for-linus (4ea95c04fa6b vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart) +$ git merge -m Merge branch 'for-linus' of git://github.com/awilliam/linux-vfio.git vfio-fixes/for-linus +Already up to date. +Merging kselftest-fixes/fixes (b54761f6e977 kselftest/seccomp: Report each expectation we assert as a KTAP test) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest-fixes/fixes +Already up to date. +Merging modules-fixes/modules-linus (f412eef03938 Documentation: livepatch: module-elf-format: Remove local klp_modinfo definition) +$ git merge -m Merge branch 'modules-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules-fixes/modules-linus +Already up to date. +Merging dmaengine-fixes/fixes (a22fe1d6dec7 dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine-fixes/fixes +Merge made by the 'ort' strategy. + drivers/dma/at_hdmac.c | 21 +++++++++++---------- + drivers/dma/fsl-dpaa2-qdma/dpaa2-qdma.c | 10 ++++++---- + drivers/dma/fsl-qdma.c | 33 ++++++++++++--------------------- + drivers/dma/ti/edma.c | 10 ++++++++++ + drivers/dma/ti/k3-udma.c | 10 ++++++++-- + include/linux/dmaengine.h | 3 ++- + 6 files changed, 49 insertions(+), 38 deletions(-) +Merging backlight-fixes/for-backlight-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-backlight-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight-fixes/for-backlight-fixes +Already up to date. +Merging mtd-fixes/mtd/fixes (7c1b1906229d mtd: spinand: gigadevice: Fix the get ecc status issue) +$ git merge -m Merge branch 'mtd/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd-fixes/mtd/fixes +Merge made by the 'ort' strategy. + drivers/mtd/nand/spi/gigadevice.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) +Merging mfd-fixes/for-mfd-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-mfd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd-fixes/for-mfd-fixes +Already up to date. +Merging v4l-dvb-fixes/fixes (b32431b75321 media: vb2: refactor setting flags and caps, fix missing cap) +$ git merge -m Merge branch 'fixes' of https://git.linuxtv.org/media_stage.git v4l-dvb-fixes/fixes +Already up to date. +Merging reset-fixes/reset/fixes (4a6756f56bcf reset: Fix crash when freeing non-existent optional resets) +$ git merge -m Merge branch 'reset/fixes' of https://git.pengutronix.de/git/pza/linux reset-fixes/reset/fixes +Already up to date. +Merging mips-fixes/mips-fixes (59be5c358501 mips: Call lose_fpu(0) before initializing fcr31 in mips_set_personality_nan) +$ git merge -m Merge branch 'mips-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips-fixes/mips-fixes +Already up to date. +Merging at91-fixes/at91-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'at91-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91-fixes/at91-fixes +Already up to date. +Merging omap-fixes/fixes (9b6a51aab5f5 ARM: dts: Fix occasional boot hang for am3 usb) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap-fixes/fixes +Already up to date. +Merging kvm-fixes/master (0dd3ee311255 Linux 6.7) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm-fixes/master +Already up to date. +Merging kvms390-fixes/master (83303a4c776c KVM: s390: fix cc for successful PQAP) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390-fixes/master +Auto-merging arch/s390/kvm/vsie.c +Merge made by the 'ort' strategy. + arch/s390/kvm/priv.c | 8 ++++++-- + arch/s390/kvm/vsie.c | 1 - + arch/s390/mm/gmap.c | 1 + + 3 files changed, 7 insertions(+), 3 deletions(-) +Merging hwmon-fixes/hwmon (915644189c22 hwmon: (pmbus/mp2975) Correct comment inside 'mp2975_read_byte_data') +$ git merge -m Merge branch 'hwmon' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-fixes/hwmon +Merge made by the 'ort' strategy. + drivers/hwmon/gigabyte_waterforce.c | 2 +- + drivers/hwmon/pmbus/mp2975.c | 16 ++++++++++++++++ + 2 files changed, 17 insertions(+), 1 deletion(-) +Merging nvdimm-fixes/libnvdimm-fixes (33908660e814 ACPI: NFIT: Fix incorrect calculation of idt size) +$ git merge -m Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm-fixes/libnvdimm-fixes +Already up to date. +Merging cxl-fixes/fixes (6be99530c92c x86/numa: Fix the sort compare func used in numa_fill_memblks()) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl-fixes/fixes +Merge made by the 'ort' strategy. + arch/x86/mm/numa.c | 21 ++++++++------------- + drivers/cxl/core/pci.c | 43 +++++++++++++++++++++++++++++++------------ + include/linux/memblock.h | 2 ++ + mm/memblock.c | 5 +++-- + 4 files changed, 44 insertions(+), 27 deletions(-) +Merging btrfs-fixes/next-fixes (c94bd41cb0b6 Merge branch 'misc-6.8' into next-fixes) +$ git merge -m Merge branch 'next-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs-fixes/next-fixes +Merge made by the 'ort' strategy. +Merging vfs-fixes/fixes (485053bb81c8 fix ufs_get_locked_folio() breakage) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs-fixes/fixes +Already up to date. +Merging dma-mapping-fixes/for-linus (d5090484b021 swiotlb: do not try to allocate a TLB bigger than MAX_ORDER pages) +$ git merge -m Merge branch 'for-linus' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping-fixes/for-linus +Already up to date. +Merging drivers-x86-fixes/fixes (1abdf288b0ef platform/x86: touchscreen_dmi: Add info for the TECLAST X16 Plus tablet) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86-fixes/fixes +Already up to date. +Merging samsung-krzk-fixes/fixes (eab4f56d3e75 ARM: dts: exynos4212-tab3: add samsung,invert-vclk flag to fimd) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk-fixes/fixes +Already up to date. +Merging pinctrl-samsung-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung-fixes/fixes +Already up to date. +Merging devicetree-fixes/dt/linus (8f7e91790738 of: property: fix typo in io-channels) +$ git merge -m Merge branch 'dt/linus' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree-fixes/dt/linus +Auto-merging Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml +Auto-merging Documentation/devicetree/bindings/usb/microchip,usb5744.yaml +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/Makefile | 5 ++++- + Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml | 3 ++- + .../devicetree/bindings/display/bridge/nxp,tda998x.yaml | 7 +++++-- + .../devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml | 3 ++- + .../devicetree/bindings/reset/xlnx,zynqmp-reset.yaml | 3 ++- + Documentation/devicetree/bindings/tpm/tpm-common.yaml | 2 +- + Documentation/devicetree/bindings/usb/dwc3-xilinx.yaml | 3 ++- + .../devicetree/bindings/usb/microchip,usb5744.yaml | 3 ++- + Documentation/devicetree/bindings/usb/xlnx,usb2.yaml | 3 ++- + drivers/of/property.c | 2 +- + tools/testing/selftests/dt/test_unprobed_devices.sh | 13 +++++++------ + 11 files changed, 30 insertions(+), 17 deletions(-) +Merging dt-krzk-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk-fixes/fixes +Already up to date. +Merging scsi-fixes/fixes (f4469f385835 scsi: storvsc: Fix ring buffer size calculation) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-fixes/fixes +Already up to date. +Merging drm-fixes/drm-fixes (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'drm-fixes' of git://git.freedesktop.org/git/drm/drm.git drm-fixes/drm-fixes +Already up to date. +Merging drm-intel-fixes/for-linux-next-fixes (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm-intel drm-intel-fixes/for-linux-next-fixes +Already up to date. +Merging mmc-fixes/fixes (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc-fixes/fixes +Already up to date. +Merging rtc-fixes/rtc-fixes (08279468a294 rtc: sunplus: fix format string for printing resource) +$ git merge -m Merge branch 'rtc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc-fixes/rtc-fixes +Already up to date. +Merging gnss-fixes/gnss-linus (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'gnss-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss-fixes/gnss-linus +Already up to date. +Merging hyperv-fixes/hyperv-fixes (564eac2860bd hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC) +$ git merge -m Merge branch 'hyperv-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv-fixes/hyperv-fixes +Merge made by the 'ort' strategy. + drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++- + 1 file changed, 30 insertions(+), 1 deletion(-) +Merging soc-fsl-fixes/fix (06c2afb862f9 Linux 6.5-rc1) +$ git merge -m Merge branch 'fix' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl-fixes/fix +Already up to date. +Merging risc-v-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v-fixes/fixes +Already up to date. +Merging riscv-dt-fixes/riscv-dt-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'riscv-dt-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt-fixes/riscv-dt-fixes +Already up to date. +Merging riscv-soc-fixes/riscv-soc-fixes (a9d022ae8c4f Merge branch 'riscv-soc-drivers-fixes' into riscv-soc-fixes) +$ git merge -m Merge branch 'riscv-soc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc-fixes/riscv-soc-fixes +Merge made by the 'ort' strategy. + drivers/soc/microchip/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging fpga-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga-fixes/fixes +Already up to date. +Merging spdx/spdx-linus (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'spdx-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git spdx/spdx-linus +Already up to date. +Merging gpio-brgl-fixes/gpio/for-current (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'gpio/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl-fixes/gpio/for-current +Already up to date. +Merging gpio-intel-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel-fixes/fixes +Already up to date. +Merging pinctrl-intel-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel-fixes/fixes +Already up to date. +Merging erofs-fixes/fixes (d9281660ff3f erofs: relaxed temporary buffers allocation on readahead) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs-fixes/fixes +Already up to date. +Merging kunit-fixes/kunit-fixes (1a9f2c776d14 Documentation: KUnit: Update the instructions on how to test static functions) +$ git merge -m Merge branch 'kunit-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-fixes/kunit-fixes +Already up to date. +Merging ubifs-fixes/fixes (2241ab53cbb5 Linux 6.2-rc5) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs-fixes/fixes +Already up to date. +Merging memblock-fixes/fixes (6a9531c3a880 memblock: fix crash when reserved memory is not added to memory) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock-fixes/fixes +Already up to date. +Merging nfsd-fixes/nfsd-fixes (ccbca118ef1a NFSv4.1: Assign the right value for initval and retries for rpc timeout) +$ git merge -m Merge branch 'nfsd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd-fixes/nfsd-fixes +Merge made by the 'ort' strategy. + net/sunrpc/svc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) +Merging renesas-fixes/fixes (9eab43facdad soc: renesas: ARCH_R9A07G043 depends on !RISCV_ISA_ZICBOM) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas-fixes/fixes +Already up to date. +Merging perf-current/perf-tools (fdd0ae72b34e perf tools headers: update the asm-generic/unaligned.h copy with the kernel sources) +$ git merge -m Merge branch 'perf-tools' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools perf-current/perf-tools +Merge made by the 'ort' strategy. + tools/arch/x86/include/asm/cpufeatures.h | 8 +- + tools/arch/x86/include/asm/msr-index.h | 8 + + tools/arch/x86/include/uapi/asm/kvm.h | 3 + + tools/arch/x86/lib/memcpy_64.S | 4 +- + tools/arch/x86/lib/memset_64.S | 4 +- + tools/include/asm-generic/unaligned.h | 24 +- + tools/include/uapi/asm-generic/unistd.h | 15 +- + tools/include/uapi/drm/drm.h | 72 +++++- + tools/include/uapi/drm/i915_drm.h | 12 +- + tools/include/uapi/linux/fcntl.h | 3 + + tools/include/uapi/linux/kvm.h | 140 ++++-------- + tools/include/uapi/linux/mount.h | 70 ++++++ + tools/include/uapi/linux/stat.h | 1 + + tools/perf/Documentation/perf-list.txt | 4 + + tools/perf/Makefile.perf | 10 + + tools/perf/builtin-list.c | 211 ++++++++++------- + tools/perf/builtin-record.c | 4 +- + tools/perf/builtin-top.c | 2 +- + .../pmu-events/arch/x86/alderlake/adl-metrics.json | 254 ++++++++++----------- + .../arch/x86/alderlaken/adln-metrics.json | 4 - + .../arch/x86/sapphirerapids/spr-metrics.json | 25 +- + tools/perf/tests/shell/daemon.sh | 34 ++- + tools/perf/tests/shell/list.sh | 21 +- + tools/perf/tests/shell/script.sh | 12 +- + tools/perf/trace/beauty/statx.c | 1 + + tools/perf/util/evlist.c | 9 +- + tools/perf/util/hist.c | 4 +- + tools/perf/util/include/linux/linkage.h | 4 + + tools/perf/util/metricgroup.c | 2 +- + tools/perf/util/print-events.c | 2 +- + tools/perf/util/synthetic-events.c | 4 +- + 31 files changed, 588 insertions(+), 383 deletions(-) +Merging efi-fixes/urgent (aa0e784dea7c efi/libstub: Add one kernel-doc comment) +$ git merge -m Merge branch 'urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi-fixes/urgent +Merge made by the 'ort' strategy. + drivers/firmware/efi/libstub/Makefile | 4 ++-- + drivers/firmware/efi/libstub/alignedmem.c | 1 + + drivers/firmware/efi/libstub/efistub.h | 3 ++- + drivers/firmware/efi/libstub/kaslr.c | 2 +- + drivers/firmware/efi/libstub/randomalloc.c | 12 +++++++----- + drivers/firmware/efi/libstub/x86-stub.c | 25 +++++++++++++++---------- + drivers/firmware/efi/libstub/x86-stub.h | 4 ++-- + drivers/firmware/efi/libstub/zboot.c | 2 +- + 8 files changed, 31 insertions(+), 22 deletions(-) +Merging zstd-fixes/zstd-linus (77618db34645 zstd: Fix array-index-out-of-bounds UBSAN warning) +$ git merge -m Merge branch 'zstd-linus' of https://github.com/terrelln/linux.git zstd-fixes/zstd-linus +Already up to date. +Merging battery-fixes/fixes (d0266d7ab161 Revert "power: supply: qcom_battmgr: Register the power supplies after PDR is up") +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery-fixes/fixes +Merge made by the 'ort' strategy. + drivers/power/supply/qcom_battmgr.c | 109 ++++++++++++++++-------------------- + 1 file changed, 49 insertions(+), 60 deletions(-) +Merging uml-fixes/fixes (73a23d771033 um: harddog: fix modular build) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml-fixes/fixes +Already up to date. +Merging iommufd-fixes/for-rc (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd-fixes/for-rc +Already up to date. +Merging rust-fixes/rust-fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'rust-fixes' of https://github.com/Rust-for-Linux/linux.git rust-fixes/rust-fixes +Already up to date. +Merging v9fs-fixes/fixes/next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes/next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-fixes/fixes/next +Already up to date. +Merging w1-fixes/fixes (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1-fixes/fixes +Already up to date. +Merging pmdomain-fixes/fixes (c41336f4d690 pmdomain: mediatek: fix race conditions with genpd) +$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain-fixes/fixes +Merge made by the 'ort' strategy. + drivers/pmdomain/core.c | 2 +- + drivers/pmdomain/mediatek/mtk-pm-domains.c | 15 +++++++-------- + drivers/pmdomain/renesas/r8a77980-sysc.c | 3 ++- + 3 files changed, 10 insertions(+), 10 deletions(-) +Merging overlayfs-fixes/ovl-fixes (420332b94119 ovl: mark xwhiteouts directory with overlay.opaque='x') +$ git merge -m Merge branch 'ovl-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs-fixes/ovl-fixes +Already up to date. +Merging i2c-host-fixes/i2c/i2c-host-fixes (9189526c46f2 MAINTAINERS: Update i2c host drivers repository) +$ git merge -m Merge branch 'i2c/i2c-host-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host-fixes/i2c/i2c-host-fixes +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + MAINTAINERS | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging drm-misc-fixes/for-linux-next-fixes (1c1914d6e8c6 dma-buf: heaps: Don't track CMA dma-buf pages under RssFile) +$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc drm-misc-fixes/for-linux-next-fixes +Merge made by the 'ort' strategy. + drivers/dma-buf/heaps/cma_heap.c | 7 +++---- + drivers/gpu/drm/virtio/virtgpu_drv.c | 1 + + 2 files changed, 4 insertions(+), 4 deletions(-) +Merging mm-stable/mm-stable (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'mm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-stable/mm-stable +Already up to date. +Merging mm-nonmm-stable/mm-nonmm-stable (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'mm-nonmm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-nonmm-stable/mm-nonmm-stable +Already up to date. +Merging mm/mm-everything (ec2dacea82ce Merge branch 'mm-nonmm-unstable' into mm-everything) +$ git merge -m Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm/mm-everything +Auto-merging Makefile +Auto-merging arch/arm64/kernel/Makefile +Auto-merging arch/x86/Makefile +Auto-merging arch/x86/kernel/alternative.c +Auto-merging drivers/firmware/efi/libstub/Makefile +Auto-merging include/linux/sched.h +Auto-merging net/bridge/br_multicast.c +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-bus-dax | 153 ++ + .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + + .../sysfs-kernel-mm-mempolicy-weighted-interleave | 25 + + Documentation/admin-guide/cgroup-v2.rst | 18 +- + Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 +- + Documentation/admin-guide/mm/damon/usage.rst | 42 +- + .../admin-guide/mm/numa_memory_policy.rst | 9 + + Documentation/admin-guide/sysctl/kernel.rst | 14 +- + Documentation/process/changes.rst | 2 +- + .../zh_CN/admin-guide/mm/damon/usage.rst | 20 +- + .../zh_TW/admin-guide/mm/damon/usage.rst | 20 +- + Makefile | 8 - + arch/Kconfig | 2 +- + arch/arm/Kconfig.debug | 2 +- + arch/arm/configs/aspeed_g4_defconfig | 2 +- + arch/arm/configs/aspeed_g5_defconfig | 2 +- + arch/arm/include/asm/current.h | 8 +- + arch/arm/include/asm/ptdump.h | 6 +- + arch/arm/kernel/setup.c | 4 +- + arch/arm/mm/init.c | 2 +- + arch/arm64/Kconfig | 12 +- + .../include/asm/{crash_core.h => crash_reserve.h} | 4 +- + arch/arm64/include/asm/esr.h | 4 + + arch/arm64/include/asm/kexec.h | 2 +- + arch/arm64/include/asm/pgtable.h | 8 + + arch/arm64/include/asm/ptdump.h | 7 - + arch/arm64/include/asm/tlbflush.h | 16 + + arch/arm64/kernel/Makefile | 2 +- + arch/arm64/kernel/machine_kexec.c | 2 +- + arch/arm64/kernel/machine_kexec_file.c | 10 +- + arch/arm64/kernel/{crash_core.c => vmcore_info.c} | 3 +- + arch/arm64/mm/fault.c | 78 +- + arch/arm64/mm/init.c | 2 +- + arch/arm64/mm/mmu.c | 30 +- + arch/arm64/mm/ptdump.c | 11 +- + arch/loongarch/kernel/setup.c | 2 +- + arch/mips/kernel/setup.c | 17 +- + arch/powerpc/Kconfig | 11 +- + arch/powerpc/Makefile | 4 +- + arch/powerpc/kernel/setup-common.c | 2 +- + arch/powerpc/kvm/book3s_hv_nested.c | 2 +- + arch/powerpc/mm/hugetlbpage.c | 2 +- + arch/powerpc/mm/mmu_decl.h | 6 - + arch/powerpc/mm/nohash/kaslr_booke.c | 4 +- + arch/powerpc/mm/pgtable_32.c | 4 - + arch/powerpc/mm/pgtable_64.c | 3 - + arch/powerpc/mm/ptdump/ptdump.c | 21 +- + arch/powerpc/platforms/powernv/opal-core.c | 2 +- + arch/riscv/Kconfig | 6 +- + .../include/asm/{crash_core.h => crash_reserve.h} | 4 +- + arch/riscv/include/asm/ftrace.h | 14 +- + arch/riscv/include/asm/ptdump.h | 22 - + arch/riscv/kernel/Makefile | 2 +- + arch/riscv/kernel/elf_kexec.c | 9 +- + arch/riscv/kernel/mcount.S | 10 +- + arch/riscv/kernel/{crash_core.c => vmcore_info.c} | 3 +- + arch/riscv/mm/init.c | 5 +- + arch/riscv/mm/ptdump.c | 12 +- + arch/s390/Kconfig | 1 + + arch/s390/include/asm/ftrace.h | 2 +- + arch/s390/include/asm/ptdump.h | 14 - + arch/s390/kernel/kexec_elf.c | 2 + + arch/s390/kernel/kexec_image.c | 2 + + arch/s390/kernel/machine_kexec_file.c | 10 + + arch/s390/mm/dump_pagetables.c | 21 +- + arch/s390/mm/init.c | 5 - + arch/s390/mm/pgtable.c | 4 +- + arch/s390/mm/vmem.c | 62 +- + arch/sh/kernel/machine_kexec.c | 3 + + arch/sh/kernel/setup.c | 2 +- + arch/x86/Kconfig | 2 +- + arch/x86/Makefile | 6 - + .../include/asm/{crash_core.h => crash_reserve.h} | 6 +- + arch/x86/include/asm/mmu.h | 2 +- + arch/x86/include/asm/pgtable.h | 5 +- + arch/x86/kernel/Makefile | 6 +- + arch/x86/kernel/alternative.c | 2 +- + arch/x86/kernel/cpu/mshyperv.c | 4 + + arch/x86/kernel/kexec-bzimage64.c | 4 + + arch/x86/kernel/kvm.c | 4 +- + arch/x86/kernel/machine_kexec_64.c | 3 + + arch/x86/kernel/reboot.c | 2 +- + arch/x86/kernel/setup.c | 2 +- + arch/x86/kernel/smp.c | 2 +- + .../kernel/{crash_core_32.c => vmcore_info_32.c} | 2 +- + .../kernel/{crash_core_64.c => vmcore_info_64.c} | 2 +- + arch/x86/mm/dump_pagetables.c | 24 +- + arch/x86/mm/init_32.c | 2 - + arch/x86/mm/init_64.c | 2 - + arch/x86/mm/tlb.c | 37 +- + arch/x86/power/Makefile | 2 +- + arch/x86/xen/enlighten_hvm.c | 4 + + arch/x86/xen/mmu_pv.c | 2 +- + crypto/blake2b_generic.c | 2 +- + drivers/android/binder.c | 4 +- + drivers/base/cacheinfo.c | 50 +- + drivers/base/cpu.c | 6 +- + drivers/base/memory.c | 23 +- + drivers/cpuidle/cpuidle.c | 2 +- + drivers/dax/bus.c | 295 +++- + drivers/firmware/efi/libstub/Makefile | 2 +- + drivers/firmware/qemu_fw_cfg.c | 14 +- + drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +- + drivers/md/bcache/sysfs.c | 8 +- + drivers/media/test-drivers/vicodec/codec-fwht.c | 2 +- + drivers/regulator/Kconfig | 2 +- + drivers/s390/char/sclp_cmd.c | 44 +- + fs/Kconfig | 1 + + fs/nilfs2/alloc.c | 89 +- + fs/nilfs2/bmap.c | 3 - + fs/nilfs2/cpfile.c | 323 ++-- + fs/nilfs2/cpfile.h | 10 +- + fs/nilfs2/dat.c | 38 +- + fs/nilfs2/ifile.c | 21 +- + fs/nilfs2/ifile.h | 10 +- + fs/nilfs2/inode.c | 44 +- + fs/nilfs2/mdt.c | 4 +- + fs/nilfs2/nilfs.h | 3 +- + fs/nilfs2/page.c | 8 +- + fs/nilfs2/segbuf.c | 4 +- + fs/nilfs2/segment.c | 121 +- + fs/nilfs2/sufile.c | 86 +- + fs/nilfs2/super.c | 31 +- + fs/ocfs2/dlmglue.c | 2 +- + fs/proc/Kconfig | 2 +- + fs/proc/kcore.c | 2 +- + fs/proc/task_mmu.c | 17 +- + fs/userfaultfd.c | 2 +- + include/asm-generic/vmlinux.lds.h | 2 +- + include/linux/buildid.h | 2 +- + include/linux/compiler-clang.h | 10 +- + include/linux/crash_core.h | 158 +- + include/linux/crash_reserve.h | 48 + + include/linux/flex_proportions.h | 32 - + include/linux/gfp.h | 2 +- + include/linux/highmem.h | 14 + + include/linux/hugetlb.h | 2 +- + include/linux/kexec.h | 47 +- + include/linux/list.h | 15 + + include/linux/list_lru.h | 18 - + include/linux/memory.h | 9 + + include/linux/memory_hotplug.h | 24 +- + include/linux/memremap.h | 1 + + include/linux/min_heap.h | 42 +- + include/linux/mm.h | 12 +- + include/linux/mmu_context.h | 2 +- + include/linux/moduleloader.h | 8 + + include/linux/padata.h | 2 + + include/linux/poison.h | 3 + + include/linux/ptdump.h | 10 + + include/linux/sched.h | 1 + + include/linux/start_kernel.h | 2 - + include/linux/swap.h | 5 +- + include/linux/swapops.h | 13 + + include/linux/vmalloc.h | 1 - + include/linux/vmcore_info.h | 81 + + include/linux/win_minmax.h | 4 +- + include/linux/zswap.h | 7 +- + include/trace/events/oom.h | 19 +- + include/uapi/linux/mempolicy.h | 1 + + init/initramfs.c | 2 +- + init/main.c | 16 +- + ipc/ipc_sysctl.c | 37 +- + ipc/mq_sysctl.c | 36 + + kernel/Kconfig.kexec | 12 +- + kernel/Makefile | 5 +- + kernel/bounds.c | 2 +- + kernel/crash_core.c | 816 +++------- + kernel/crash_reserve.c | 464 ++++++ + kernel/{crash_dump.c => elfcorehdr.c} | 0 + kernel/events/uprobes.c | 2 +- + kernel/kallsyms_selftest.c | 1 - + kernel/kexec.c | 11 +- + kernel/kexec_core.c | 250 +-- + kernel/kexec_file.c | 13 +- + kernel/kexec_internal.h | 2 + + kernel/kprobes.c | 4 +- + kernel/ksysfs.c | 10 +- + kernel/module/main.c | 5 + + kernel/padata.c | 14 +- + kernel/panic.c | 5 + + kernel/printk/printk.c | 4 +- + kernel/ptrace.c | 13 +- + kernel/user_namespace.c | 2 +- + kernel/vmcore_info.c | 230 +++ + lib/Kconfig.debug | 4 +- + lib/Kconfig.kasan | 2 +- + lib/buildid.c | 2 +- + lib/dhry_1.c | 2 +- + lib/dhry_run.c | 1 - + lib/flex_proportions.c | 77 - + lib/iov_iter.c | 5 +- + lib/maple_tree.c | 6 +- + lib/raid6/Makefile | 2 +- + lib/sort.c | 20 +- + lib/stackdepot.c | 254 +-- + lib/stackinit_kunit.c | 2 +- + mm/cma.c | 8 +- + mm/compaction.c | 222 ++- + mm/damon/Kconfig | 7 +- + mm/damon/dbgfs.c | 27 +- + mm/filemap.c | 4 +- + mm/huge_memory.c | 23 +- + mm/hugetlb.c | 234 ++- + mm/hugetlb_vmemmap.c | 55 +- + mm/internal.h | 18 +- + mm/kasan/common.c | 8 +- + mm/kasan/generic.c | 68 +- + mm/kasan/kasan.h | 10 - + mm/kasan/quarantine.c | 5 +- + mm/khugepaged.c | 22 +- + mm/kmsan/hooks.c | 50 +- + mm/list_lru.c | 17 +- + mm/memcontrol.c | 150 +- + mm/memory.c | 76 +- + mm/memory_hotplug.c | 34 +- + mm/mempolicy.c | 484 +++++- + mm/mm_init.c | 1 + + mm/mmap.c | 83 +- + mm/mprotect.c | 4 +- + mm/nommu.c | 2 - + mm/oom_kill.c | 6 +- + mm/page_alloc.c | 89 +- + mm/ptdump.c | 22 + + mm/readahead.c | 6 +- + mm/rmap.c | 10 +- + mm/slab_common.c | 2 +- + mm/sparse.c | 3 +- + mm/swapfile.c | 28 +- + mm/userfaultfd.c | 2 +- + mm/vmalloc.c | 1086 +++++++++---- + mm/vmscan.c | 51 +- + mm/zswap.c | 1641 ++++++++++---------- + net/bridge/br_multicast.c | 2 +- + scripts/min-tool-version.sh | 2 +- + scripts/recordmcount.pl | 2 +- + security/Kconfig | 2 - + tools/mm/Makefile | 9 +- + tools/mm/thpmaps | 675 ++++++++ + tools/objtool/noreturns.h | 1 - + tools/testing/selftests/damon/_chk_dependency.sh | 11 +- + tools/testing/selftests/damon/_debugfs_common.sh | 7 + + .../selftests/damon/debugfs_empty_targets.sh | 12 +- + .../selftests/filesystems/eventfd/.gitignore | 2 + + .../testing/selftests/filesystems/eventfd/Makefile | 7 + + .../selftests/filesystems/eventfd/eventfd_test.c | 186 +++ + tools/testing/selftests/memfd/memfd_test.c | 10 - + tools/testing/selftests/mm/.gitignore | 1 + + tools/testing/selftests/mm/Makefile | 6 + + .../selftests/mm/charge_reserved_hugetlb.sh | 4 + + tools/testing/selftests/mm/config | 3 + + tools/testing/selftests/mm/hugepage-shm.c | 49 +- + tools/testing/selftests/mm/hugepage-vmemmap.c | 39 +- + tools/testing/selftests/mm/hugetlb-madvise.c | 207 +-- + tools/testing/selftests/mm/hugetlb-read-hwpoison.c | 116 +- + tools/testing/selftests/mm/hugetlb_madv_vs_map.c | 124 ++ + .../selftests/mm/hugetlb_reparenting_test.sh | 9 +- + tools/testing/selftests/mm/khugepaged.c | 385 ++--- + tools/testing/selftests/mm/ksm_functional_tests.c | 4 +- + tools/testing/selftests/mm/on-fault-limit.c | 38 +- + tools/testing/selftests/mm/protection_keys.c | 34 + + tools/testing/selftests/mm/run_vmtests.sh | 19 +- + 262 files changed, 7348 insertions(+), 4347 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-bus-dax + create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy + create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave + rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%) + rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (92%) + rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%) + delete mode 100644 arch/riscv/include/asm/ptdump.h + rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (88%) + delete mode 100644 arch/s390/include/asm/ptdump.h + rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%) + rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%) + rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%) + create mode 100644 include/linux/crash_reserve.h + create mode 100644 include/linux/vmcore_info.h + create mode 100644 kernel/crash_reserve.c + rename kernel/{crash_dump.c => elfcorehdr.c} (100%) + create mode 100644 kernel/vmcore_info.c + create mode 100644 tools/mm/thpmaps + create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore + create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile + create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c + create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c +Merging kbuild/for-next (bd768db42ef6 kbuild: deb-pkg: call more misc debhelper commands) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild/for-next +Merge made by the 'ort' strategy. + Documentation/kbuild/kconfig.rst | 363 ++++++++++++++++++--------------------- + scripts/kconfig/lexer.l | 38 ++-- + scripts/package/builddeb | 48 ++---- + scripts/package/debian/rules | 63 ++++++- + 4 files changed, 249 insertions(+), 263 deletions(-) +Merging clang-format/clang-format (5a205c6a9f79 clang-format: Update with v6.7-rc4's `for_each` macro list) +$ git merge -m Merge branch 'clang-format' of https://github.com/ojeda/linux.git clang-format/clang-format +Already up to date. +Merging perf/perf-tools-next (7727d59de44e perf tools: Add -H short option for --hierarchy) + 63f209b6fa4d ("perf evlist: Fix evlist__new_default() for > 1 core PMU") +$ git merge -m Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf/perf-tools-next +Auto-merging tools/perf/builtin-record.c +Auto-merging tools/perf/builtin-top.c +Merge made by the 'ort' strategy. + tools/perf/Documentation/perf-report.txt | 29 +++- + tools/perf/Documentation/perf-top.txt | 32 +++- + tools/perf/Makefile.config | 6 + + tools/perf/arch/arm/util/pmu.c | 3 + + tools/perf/arch/arm64/util/mem-events.c | 39 +---- + tools/perf/arch/arm64/util/mem-events.h | 7 + + tools/perf/arch/powerpc/util/Build | 1 + + tools/perf/arch/powerpc/util/mem-events.c | 16 +- + tools/perf/arch/powerpc/util/mem-events.h | 7 + + tools/perf/arch/powerpc/util/pmu.c | 12 ++ + tools/perf/arch/x86/util/mem-events.c | 99 ++--------- + tools/perf/arch/x86/util/mem-events.h | 10 ++ + tools/perf/arch/x86/util/pmu.c | 19 +- + tools/perf/builtin-c2c.c | 45 ++--- + tools/perf/builtin-mem.c | 48 ++--- + tools/perf/builtin-record.c | 14 +- + tools/perf/builtin-report.c | 2 +- + tools/perf/builtin-sched.c | 57 ++---- + tools/perf/builtin-top.c | 2 +- + tools/perf/builtin-version.c | 1 + + tools/perf/tests/shell/stat_bpf_counters.sh | 12 +- + tools/perf/tests/shell/test_arm_callgraph_fp.sh | 6 + + tools/perf/util/annotate-data.c | 119 +++++++++++-- + tools/perf/util/annotate-data.h | 8 +- + tools/perf/util/annotate.c | 153 ++++++++++++++-- + tools/perf/util/annotate.h | 12 +- + tools/perf/util/data.c | 10 +- + tools/perf/util/data.h | 6 +- + tools/perf/util/dwarf-aux.c | 187 +++++++++++++++++--- + tools/perf/util/dwarf-aux.h | 18 ++ + tools/perf/util/evsel.c | 146 ++++++++++++++++ + tools/perf/util/evsel.h | 1 + + tools/perf/util/mem-events.c | 221 ++++++++++++++---------- + tools/perf/util/mem-events.h | 19 +- + tools/perf/util/pmu.c | 16 +- + tools/perf/util/pmu.h | 7 + + tools/perf/util/pmus.c | 6 - + tools/perf/util/pmus.h | 1 - + 38 files changed, 985 insertions(+), 412 deletions(-) + create mode 100644 tools/perf/arch/arm64/util/mem-events.h + create mode 100644 tools/perf/arch/powerpc/util/mem-events.h + create mode 100644 tools/perf/arch/powerpc/util/pmu.c + create mode 100644 tools/perf/arch/x86/util/mem-events.h +Merging compiler-attributes/compiler-attributes (2993eb7a8d34 Compiler Attributes: counted_by: fixup clang URL) +$ git merge -m Merge branch 'compiler-attributes' of https://github.com/ojeda/linux.git compiler-attributes/compiler-attributes +Merge made by the 'ort' strategy. + include/linux/compiler_attributes.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) +Merging dma-mapping/for-next (7c65aa3cc072 dma-debug: fix kernel-doc warnings) +$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping/for-next +Already up to date. +Merging asm-generic/master (34b2321cc648 MAINTAINERS: Add Andreas Larsson as co-maintainer for arch/sparc) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git asm-generic/master +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. +Merging arc/for-next (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc/for-next +Already up to date. +Merging arm/for-next (8790fade1a19 Merge branches 'misc' and 'fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm/for-next +Already up to date. +Merging arm64/for-next/core (1b20d0486a60 arm64: Fix silcon-errata.rst formatting) +$ git merge -m Merge branch 'for-next/core' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64/for-next/core +Already up to date. +Merging arm-perf/for-next/perf (bb339db4d363 arm: perf: Fix ARCH=arm build with GCC) +$ git merge -m Merge branch 'for-next/perf' of git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git arm-perf/for-next/perf +Already up to date. +Merging arm-soc/for-next (0d1d824a4ac1 Merge tag 'samsung-fixes-6.8' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux into arm/fixes) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc/for-next +Already up to date. +Merging amlogic/for-next (0dd3ee311255 Linux 6.7) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git amlogic/for-next +Already up to date. +Merging asahi-soc/asahi-soc/for-next (ffc253263a13 Linux 6.6) +$ git merge -m Merge branch 'asahi-soc/for-next' of https://github.com/AsahiLinux/linux.git asahi-soc/asahi-soc/for-next +Already up to date. +Merging aspeed/for-next (e60f7a99d378 ARM: dts: aspeed: minerva: add sgpio line name) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git aspeed/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/arm/aspeed/aspeed.yaml | 4 + + arch/arm/boot/dts/aspeed/Makefile | 6 +- + .../dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts | 322 ++++++++++++ + .../dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts | 324 ++++++++++++ + .../boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts | 377 +++++++++++++ + .../boot/dts/aspeed/aspeed-bmc-facebook-harma.dts | 585 +++++++++++++++++++++ + .../dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts | 265 ---------- + .../dts/aspeed/aspeed-bmc-facebook-minerva.dts | 543 +++++++++++++++++++ + 8 files changed, 2160 insertions(+), 266 deletions(-) + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts + delete mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts + create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts +Merging at91/at91-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'at91-next' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91/at91-next +Already up to date. +Merging broadcom/next (bbf6d7dc2d94 Merge branch 'soc/next' into next) +$ git merge -m Merge branch 'next' of https://github.com/Broadcom/stblinux.git broadcom/next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml | 1 + + arch/arm/include/debug/brcmstb.S | 8 +++++--- + .../boot/dts/broadcom/bcmbca/bcm4908-asus-gt-ac5300.dts | 13 +++++++------ + arch/arm64/boot/dts/broadcom/bcmbca/bcm4908.dtsi | 3 --- + drivers/bus/brcmstb_gisb.c | 15 +++++++++++++++ + 5 files changed, 28 insertions(+), 12 deletions(-) +Merging davinci/davinci/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'davinci/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci/davinci/for-next +Already up to date. +Merging drivers-memory/for-next (2f542c937c48 dt-bindings: memory-controllers: narrow regex for unit address to hex numbers) +$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory/for-next +Merge made by the 'ort' strategy. + .../memory-controllers/nvidia,tegra20-emc.yaml | 2 +- + drivers/memory/emif.c | 65 ++++++++-------------- + 2 files changed, 25 insertions(+), 42 deletions(-) +Merging imx-mxs/for-next (4db02d61a81e Merge branch 'imx/dt64' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git imx-mxs/for-next +Merge made by the 'ort' strategy. +Merging mediatek/for-next (9802b60bd6d8 Merge branch 'v6.6-next/soc' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux.git mediatek/for-next +Merge made by the 'ort' strategy. +Merging mvebu/for-next (476887312c60 Merge branch 'mvebu/drivers' into mvebu/for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git mvebu/for-next +Merge made by the 'ort' strategy. +Merging omap/for-next (0012c1958460 Merge branches 'sgx-for-v6.9' and 'omap-for-v6.9/soc' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + .../{img,powervr.yaml => img,powervr-rogue.yaml} | 4 +- + .../devicetree/bindings/gpu/img,powervr-sgx.yaml | 138 +++++++++++++++++++++ + MAINTAINERS | 3 +- + arch/arm/boot/dts/ti/omap/am33xx.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/am3517.dtsi | 11 +- + arch/arm/boot/dts/ti/omap/am4372.dtsi | 6 + + arch/arm/boot/dts/ti/omap/dra7.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap34xx.dtsi | 11 +- + arch/arm/boot/dts/ti/omap/omap36xx.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap4.dtsi | 9 +- + arch/arm/boot/dts/ti/omap/omap5.dtsi | 9 +- + arch/arm/mach-omap2/am33xx-restart.c | 2 +- + arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c | 2 +- + arch/arm/mach-omap2/clockdomain.c | 4 +- + arch/arm/mach-omap2/cm33xx.c | 2 +- + arch/arm/mach-omap2/cminst44xx.c | 2 +- + arch/arm/mach-omap2/omap-secure.c | 4 +- + arch/arm/mach-omap2/omap_hwmod.c | 9 +- + arch/arm/mach-omap2/omap_hwmod_common_data.c | 6 +- + arch/arm/mach-omap2/pmic-cpcap.c | 24 ++-- + arch/arm/mach-omap2/powerdomain.c | 2 +- + arch/arm/mach-omap2/prm44xx.c | 2 +- + arch/arm/mach-omap2/prm_common.c | 4 +- + arch/arm/mach-omap2/wd_timer.c | 4 +- + arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 7 ++ + 25 files changed, 231 insertions(+), 61 deletions(-) + rename Documentation/devicetree/bindings/gpu/{img,powervr.yaml => img,powervr-rogue.yaml} (91%) + create mode 100644 Documentation/devicetree/bindings/gpu/img,powervr-sgx.yaml +Merging qcom/for-next (f70a1e7dd74f Merge branches 'arm32-for-6.9', 'arm64-fixes-for-6.8', 'arm64-for-6.9', 'clk-for-6.9' and 'drivers-for-6.9' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git qcom/for-next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/arm/qcom.yaml | 5 + + .../bindings/clock/qcom,gcc-sc8180x.yaml | 7 + + .../bindings/soc/qcom/qcom,rpm-master-stats.yaml | 2 + + .../dts/qcom/qcom-apq8026-samsung-matisse-wifi.dts | 7 + + arch/arm/boot/dts/qcom/qcom-apq8064.dtsi | 38 +- + arch/arm/boot/dts/qcom/qcom-ipq4019-ap.dk01.1.dtsi | 146 ++- + arch/arm/boot/dts/qcom/qcom-ipq4019.dtsi | 10 +- + arch/arm/boot/dts/qcom/qcom-ipq8064.dtsi | 2 - + arch/arm/boot/dts/qcom/qcom-msm8660.dtsi | 17 +- + arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts | 15 +- + arch/arm/boot/dts/qcom/qcom-msm8974.dtsi | 18 +- + arch/arm/boot/dts/qcom/qcom-sdx55.dtsi | 32 +- + arch/arm/boot/dts/qcom/qcom-sdx65.dtsi | 48 +- + arch/arm64/boot/dts/qcom/Makefile | 1 + + .../dts/qcom/apq8016-sbc-d3-camera-mezzanine.dts | 8 +- + arch/arm64/boot/dts/qcom/ipq5332.dtsi | 8 +- + arch/arm64/boot/dts/qcom/ipq6018.dtsi | 13 + + arch/arm64/boot/dts/qcom/ipq8074.dtsi | 14 + + arch/arm64/boot/dts/qcom/msm8916.dtsi | 9 + + arch/arm64/boot/dts/qcom/msm8939.dtsi | 9 + + arch/arm64/boot/dts/qcom/msm8953.dtsi | 7 +- + arch/arm64/boot/dts/qcom/msm8996.dtsi | 8 +- + arch/arm64/boot/dts/qcom/msm8998.dtsi | 7 +- + .../boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi} | 8 +- + arch/arm64/boot/dts/qcom/qcm6490-fairphone-fp5.dts | 56 +- + arch/arm64/boot/dts/qcom/qcs404.dtsi | 16 + + arch/arm64/boot/dts/qcom/qrb2210-rb1.dts | 78 +- + arch/arm64/boot/dts/qcom/sa8775p.dtsi | 16 +- + arch/arm64/boot/dts/qcom/sc7180.dtsi | 14 +- + arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi | 11 + + arch/arm64/boot/dts/qcom/sc7280.dtsi | 54 +- + arch/arm64/boot/dts/qcom/sc8180x.dtsi | 63 +- + arch/arm64/boot/dts/qcom/sdm630.dtsi | 26 +- + arch/arm64/boot/dts/qcom/sdm670.dtsi | 14 +- + .../arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi | 8 +- + arch/arm64/boot/dts/qcom/sdm845.dtsi | 47 +- + arch/arm64/boot/dts/qcom/sm6115.dtsi | 22 +- + arch/arm64/boot/dts/qcom/sm6125.dtsi | 9 +- + arch/arm64/boot/dts/qcom/sm6350.dtsi | 13 +- + arch/arm64/boot/dts/qcom/sm6375.dtsi | 12 +- + arch/arm64/boot/dts/qcom/sm7225-fairphone-fp4.dts | 8 +- + arch/arm64/boot/dts/qcom/sm8150.dtsi | 95 +- + arch/arm64/boot/dts/qcom/sm8250.dtsi | 93 +- + arch/arm64/boot/dts/qcom/sm8350.dtsi | 75 +- + arch/arm64/boot/dts/qcom/sm8450-hdk.dts | 4 +- + arch/arm64/boot/dts/qcom/sm8450.dtsi | 79 +- + arch/arm64/boot/dts/qcom/sm8550-hdk.dts | 1293 ++++++++++++++++++++ + arch/arm64/boot/dts/qcom/sm8550.dtsi | 107 +- + arch/arm64/boot/dts/qcom/sm8650-mtp.dts | 2 +- + arch/arm64/boot/dts/qcom/sm8650-qrd.dts | 2 +- + arch/arm64/boot/dts/qcom/sm8650.dtsi | 36 +- + arch/arm64/boot/dts/qcom/x1e80100.dtsi | 10 +- + drivers/clk/qcom/gcc-ipq6018.c | 17 + + drivers/clk/qcom/gcc-sdm845.c | 1 + + drivers/clk/qcom/gcc-sm8150.c | 352 +++--- + drivers/soc/qcom/Makefile | 1 + + drivers/soc/qcom/qcom_aoss.c | 103 +- + drivers/soc/qcom/smem.c | 11 - + drivers/soc/qcom/smp2p.c | 6 +- + drivers/soc/qcom/socinfo.c | 4 +- + drivers/soc/qcom/trace-aoss.h | 48 + + include/dt-bindings/arm/qcom,ids.h | 2 + + include/dt-bindings/clock/qcom,gcc-sm8150.h | 3 + + include/soc/qcom/qcom-spmi-pmic.h | 2 +- + 64 files changed, 2714 insertions(+), 538 deletions(-) + rename arch/arm64/boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi} (91%) + create mode 100644 arch/arm64/boot/dts/qcom/sm8550-hdk.dts + create mode 100644 drivers/soc/qcom/trace-aoss.h +Merging renesas/next (6fc5bb9da080 Merge branches 'renesas-arm-defconfig-for-v6.9', 'renesas-drivers-for-v6.9', 'renesas-dt-bindings-for-v6.9' and 'renesas-dts-for-v6.9' into renesas-next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas/next +Merge made by the 'ort' strategy. + .../bindings/clock/renesas,cpg-mssr.yaml | 1 + + .../bindings/power/renesas,rcar-sysc.yaml | 1 + + .../devicetree/bindings/reset/renesas,rst.yaml | 1 + + .../devicetree/bindings/soc/renesas/renesas.yaml | 13 + + arch/arm/boot/dts/renesas/r8a73a4-ape6evm.dts | 12 + + arch/arm/boot/dts/renesas/r8a73a4.dtsi | 23 +- + arch/arm/configs/multi_v7_defconfig | 1 - + arch/arm/configs/shmobile_defconfig | 2 - + arch/arm64/boot/dts/renesas/Makefile | 5 + + .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dts | 13 + + .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dtsi | 368 +------------------- + .../arm64/boot/dts/renesas/r8a779g0-white-hawk.dts | 58 +--- + arch/arm64/boot/dts/renesas/r8a779g0.dtsi | 84 ++--- + .../dts/renesas/r8a779g2-white-hawk-single.dts | 26 ++ + arch/arm64/boot/dts/renesas/r8a779g2.dtsi | 12 + + .../boot/dts/renesas/r8a779h0-gray-hawk-single.dts | 52 +++ + arch/arm64/boot/dts/renesas/r8a779h0.dtsi | 121 +++++++ + arch/arm64/boot/dts/renesas/r9a07g043u.dtsi | 69 ++++ + arch/arm64/boot/dts/renesas/r9a08g045.dtsi | 14 + + arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi | 5 + + arch/arm64/boot/dts/renesas/rzg3s-smarc.dtsi | 53 +++ + arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 42 +-- + arch/arm64/boot/dts/renesas/white-hawk-common.dtsi | 65 ++++ + .../boot/dts/renesas/white-hawk-cpu-common.dtsi | 375 +++++++++++++++++++++ + ...e-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} | 2 +- + ...hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} | 2 +- + arch/arm64/configs/defconfig | 1 + + drivers/soc/renesas/Kconfig | 17 +- + drivers/soc/renesas/rcar-rst.c | 1 + + drivers/soc/renesas/renesas-soc.c | 8 + + .../dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h | 96 ++++++ + include/dt-bindings/power/renesas,r8a779h0-sysc.h | 49 +++ + 32 files changed, 1088 insertions(+), 504 deletions(-) + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g0-white-hawk-cpu.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2-white-hawk-single.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts + create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-common.dtsi + create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-cpu-common.dtsi + rename arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} (97%) + rename arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} (76%) + create mode 100644 include/dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h + create mode 100644 include/dt-bindings/power/renesas,r8a779h0-sysc.h +Merging reset/reset/next (c3c46acd5be9 dt-bindings: reset: hisilicon,hi3660-reset: Drop providers and consumers from example) +$ git merge -m Merge branch 'reset/next' of https://git.pengutronix.de/git/pza/linux reset/reset/next +Already up to date. +Merging rockchip/for-next (a3c323226362 Merge branch 'v6.9-clk/next' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git rockchip/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/arm/rockchip.yaml | 38 +- + .../devicetree/bindings/soc/rockchip/grf.yaml | 1 + + arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts | 29 + + arch/arm/boot/dts/rockchip/rk3128.dtsi | 60 ++ + arch/arm64/boot/dts/rockchip/Makefile | 5 + + arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 6 + + .../boot/dts/rockchip/rk3399-kobol-helios64.dts | 3 - + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4a.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4c.dts | 2 +- + arch/arm64/boot/dts/rockchip/rk3399.dtsi | 70 +- + .../boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts | 42 ++ + .../boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts | 19 + + .../boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi | 237 +++++++ + .../boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi | 74 ++ + .../boot/dts/rockchip/rk3566-anbernic-rg503.dts | 74 ++ + .../boot/dts/rockchip/rk3566-anbernic-rgxx3.dtsi | 74 -- + .../dts/rockchip/rk3588-edgeble-neu6a-common.dtsi | 466 +++++++++++++ + .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dts | 10 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi | 232 +++++++ + .../dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso | 56 ++ + .../boot/dts/rockchip/rk3588-edgeble-neu6a.dtsi | 25 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6b-io.dts | 76 +- + .../boot/dts/rockchip/rk3588-edgeble-neu6b.dtsi | 383 +---------- + arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts | 1 + + arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts | 31 +- + arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts | 7 + + .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts | 14 + + .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts | 764 +++++++++++++++++++++ + drivers/clk/rockchip/clk-rk3568.c | 1 + + 30 files changed, 2211 insertions(+), 593 deletions(-) + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-common.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts + create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts +Merging samsung-krzk/for-next (819ce8ab3d99 Merge branch 'next/drivers' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk/for-next +Auto-merging arch/arm/configs/multi_v7_defconfig +Merge made by the 'ort' strategy. + .../bindings/clock/google,gs101-clock.yaml | 29 +- + .../devicetree/bindings/clock/tesla,fsd-clock.yaml | 2 +- + .../devicetree/bindings/i2c/i2c-exynos5.yaml | 1 + + .../soc/samsung/samsung,exynos-sysreg.yaml | 1 + + arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi | 51 ++ + arch/arm/boot/dts/samsung/exynos5420-peach-pit.dts | 1 + + .../dts/samsung/exynos5422-odroidxu3-common.dtsi | 16 +- + arch/arm/boot/dts/samsung/exynos5800-peach-pi.dts | 1 + + arch/arm/configs/exynos_defconfig | 3 + + arch/arm/configs/multi_v7_defconfig | 3 + + arch/arm/mach-s5pv210/pm.c | 2 +- + arch/arm64/boot/dts/exynos/google/gs101-oriole.dts | 14 + + arch/arm64/boot/dts/exynos/google/gs101.dtsi | 75 ++- + drivers/clk/samsung/clk-exynos850.c | 10 +- + drivers/clk/samsung/clk-gs101.c | 595 ++++++++++++++++++++- + include/dt-bindings/clock/exynos850.h | 2 + + include/dt-bindings/clock/google,gs101.h | 81 +++ + 17 files changed, 855 insertions(+), 32 deletions(-) +Merging scmi/for-linux-next (99f798bdfb75 Merge tags 'scmi-fixes-6.8' and 'ffa-fixes-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux into for-linux-next) +$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git scmi/for-linux-next +Merge made by the 'ort' strategy. +Merging sophgo/for-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'for-next' of https://github.com/sophgo/linux.git sophgo/for-next +Already up to date. +Merging stm32/stm32-next (bda732fda193 ARM: dts: stm32: fix DSI peripheral clock on stm32mp15 boards) +$ git merge -m Merge branch 'stm32-next' of git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git stm32/stm32-next +Merge made by the 'ort' strategy. + arch/arm/boot/dts/st/stm32mp157.dtsi | 2 +- + arch/arm/boot/dts/st/stm32mp157a-dk1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-dk2-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-ed1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-ev1-scmi.dts | 2 +- + arch/arm/boot/dts/st/stm32mp157c-lxa-tac-gen2.dts | 2 +- + arch/arm/boot/dts/st/stm32mp15xc-lxa-tac.dtsi | 2 +- + 7 files changed, 7 insertions(+), 7 deletions(-) +Merging sunxi/sunxi/for-next (38ed19495066 Merge branch 'sunxi/dt-for-6.9' into sunxi/for-next) +$ git merge -m Merge branch 'sunxi/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git sunxi/sunxi/for-next +Merge made by the 'ort' strategy. + .../sram/allwinner,sun4i-a10-system-control.yaml | 2 +- + .../boot/dts/allwinner/sun50i-h6-beelink-gs1.dts | 2 + + arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi | 2 + + arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi | 7 ++- + arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi | 61 ++++++++++++++++++++++ + drivers/clk/sunxi/clk-a20-gmac.c | 21 ++++---- + drivers/clk/sunxi/clk-sun9i-cpus.c | 7 +-- + drivers/clk/sunxi/clk-usb.c | 9 ++-- + 8 files changed, 90 insertions(+), 21 deletions(-) +Merging tee/next (84ec4fd88831 Merge branch 'tee_iov_iter_for_v6.8' into next) +$ git merge -m Merge branch 'next' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee/next +Merge made by the 'ort' strategy. +Merging tegra/for-next (5e6333ef8ea5 Merge branch for-6.8/arm/dt into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git tegra/for-next +Auto-merging drivers/soc/tegra/fuse/fuse-tegra30.c +Auto-merging include/linux/string.h +Auto-merging mm/util.c +Merge made by the 'ort' strategy. + arch/arm/boot/dts/nvidia/tegra30-ouya.dts | 4 +- + drivers/soc/tegra/Kconfig | 5 ++ + drivers/soc/tegra/fuse/fuse-tegra.c | 115 ++++++++++++++++++++++-------- + drivers/soc/tegra/fuse/fuse-tegra30.c | 20 ++++++ + drivers/soc/tegra/fuse/fuse.h | 8 ++- + drivers/soc/tegra/fuse/tegra-apbmisc.c | 110 +++++++++++++++++++++++----- + drivers/soc/tegra/pmc.c | 24 ------- + include/linux/string.h | 1 + + include/soc/tegra/fuse.h | 1 + + include/soc/tegra/pmc.h | 18 ----- + mm/util.c | 17 +++++ + 11 files changed, 233 insertions(+), 90 deletions(-) +Merging ti/ti-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git ti/ti-next +Already up to date. +Merging xilinx/for-next (0ee74e0d7b97 Merge remote-tracking branch 'git/zynqmp/dt' into for-next) +$ git merge -m Merge branch 'for-next' of git://github.com/Xilinx/linux-xlnx.git xilinx/for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + .../firmware/xilinx/xlnx,zynqmp-firmware.yaml | 78 +++++++++++++++++--- + .../devicetree/bindings/fpga/xlnx,versal-fpga.yaml | 2 +- + .../devicetree/bindings/soc/xilinx/xilinx.yaml | 70 +++++++++++++++--- + MAINTAINERS | 2 +- + arch/arm/mach-zynq/slcr.c | 5 +- + arch/arm64/boot/dts/xilinx/zynqmp-clk-ccf.dtsi | 16 +++- + .../boot/dts/xilinx/zynqmp-sck-kv-g-revA.dtso | 36 ++++++++- + .../boot/dts/xilinx/zynqmp-sck-kv-g-revB.dtso | 37 +++++++++- + .../boot/dts/xilinx/zynqmp-zc1751-xm015-dc1.dts | 2 +- + .../boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts | 2 +- + .../boot/dts/xilinx/zynqmp-zc1751-xm019-dc5.dts | 4 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu100-revC.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu102-revA.dts | 6 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revA.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revC.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu106-revA.dts | 6 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu111-revA.dts | 4 +- + arch/arm64/boot/dts/xilinx/zynqmp-zcu1275-revA.dts | 2 +- + arch/arm64/boot/dts/xilinx/zynqmp.dtsi | 85 +++++++++++++--------- + 19 files changed, 280 insertions(+), 83 deletions(-) +Merging clk/clk-next (efe5a1b888ab Merge branch 'clk-fixes' into clk-next) +$ git merge -m Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git clk/clk-next +Merge made by the 'ort' strategy. +Merging clk-imx/for-next (f52f00069888 clk: imx: pll14xx: change naming of fvco to fout) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git clk-imx/for-next +Already up to date. +Merging clk-renesas/renesas-clk (096311157d2a clk: renesas: r8a779g0: Fix PCIe clock name) +$ git merge -m Merge branch 'renesas-clk' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git clk-renesas/renesas-clk +Merge made by the 'ort' strategy. + drivers/clk/renesas/Kconfig | 5 + + drivers/clk/renesas/Makefile | 1 + + drivers/clk/renesas/clk-mstp.c | 16 +-- + drivers/clk/renesas/r8a779g0-cpg-mssr.c | 2 +- + drivers/clk/renesas/r8a779h0-cpg-mssr.c | 241 ++++++++++++++++++++++++++++++++ + drivers/clk/renesas/r9a07g043-cpg.c | 31 ++++ + drivers/clk/renesas/r9a08g045-cpg.c | 3 + + drivers/clk/renesas/rcar-gen4-cpg.c | 10 +- + drivers/clk/renesas/renesas-cpg-mssr.c | 117 +++++++++++++++- + drivers/clk/renesas/renesas-cpg-mssr.h | 1 + + drivers/of/base.c | 123 +++++++++++----- + include/linux/of.h | 11 ++ + 12 files changed, 502 insertions(+), 59 deletions(-) + create mode 100644 drivers/clk/renesas/r8a779h0-cpg-mssr.c +Merging csky/linux-next (2c40c1c6adab Merge tag 'usb-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb) +$ git merge -m Merge branch 'linux-next' of git://github.com/c-sky/csky-linux.git csky/linux-next +Already up to date. +Merging loongarch/loongarch-next (48ef9e87b407 LoongArch: KVM: Add returns to SIMD stubs) +$ git merge -m Merge branch 'loongarch-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git loongarch/loongarch-next +Already up to date. +Merging m68k/for-next (6b9c045b0602 m68k: defconfig: Update defconfigs for v6.7-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k/for-next +Already up to date. +Merging m68knommu/for-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git m68knommu/for-next +Already up to date. +Merging microblaze/next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'next' of git://git.monstr.eu/linux-2.6-microblaze.git microblaze/next +Already up to date. +Merging mips/mips-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'mips-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips/mips-next +Already up to date. +Merging openrisc/for-next (c289330331eb openrisc: Remove kernel-doc marker from ioremap comment) +$ git merge -m Merge branch 'for-next' of git://github.com/openrisc/linux.git openrisc/for-next +Already up to date. +Merging parisc-hd/for-next (913b9d443a01 parisc: BTLB: Fix crash when setting up BTLB at CPU bringup) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git parisc-hd/for-next +Merge made by the 'ort' strategy. + arch/parisc/Kconfig | 1 - + arch/parisc/include/asm/assembly.h | 1 + + arch/parisc/include/asm/extable.h | 64 +++++++++++++++++++++++++++++++++ + arch/parisc/include/asm/special_insns.h | 6 ++-- + arch/parisc/include/asm/uaccess.h | 48 ++++--------------------- + arch/parisc/kernel/cache.c | 10 ++++-- + arch/parisc/kernel/drivers.c | 5 ++- + arch/parisc/kernel/unaligned.c | 44 +++++++++++------------ + arch/parisc/kernel/vmlinux.lds.S | 2 +- + arch/parisc/mm/fault.c | 11 ++++-- + 10 files changed, 118 insertions(+), 74 deletions(-) + create mode 100644 arch/parisc/include/asm/extable.h +Merging powerpc/next (44a1aad2fe6c Merge branch 'topic/ppc-kvm' into next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc/next +Already up to date. +Merging soc-fsl/next (fb9c384625dd bus: fsl-mc: fsl-mc-allocator: Drop a write-only variable) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl/next +Already up to date. +Merging risc-v/for-next (cb4ede926134 riscv: Avoid code duplication with generic bitops implementation) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v/for-next +Auto-merging arch/riscv/Kconfig +Auto-merging arch/riscv/mm/init.c +Auto-merging include/linux/mm.h +Auto-merging mm/mmap.c +Merge made by the 'ort' strategy. + arch/riscv/Kbuild | 1 + + arch/riscv/Kconfig | 18 +- + arch/riscv/Makefile | 5 + + arch/riscv/crypto/Kconfig | 93 ++++ + arch/riscv/crypto/Makefile | 23 + + arch/riscv/crypto/aes-macros.S | 156 ++++++ + arch/riscv/crypto/aes-riscv64-glue.c | 550 +++++++++++++++++++++ + arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S | 312 ++++++++++++ + arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S | 146 ++++++ + arch/riscv/crypto/aes-riscv64-zvkned.S | 180 +++++++ + arch/riscv/crypto/chacha-riscv64-glue.c | 101 ++++ + arch/riscv/crypto/chacha-riscv64-zvkb.S | 294 +++++++++++ + arch/riscv/crypto/ghash-riscv64-glue.c | 168 +++++++ + arch/riscv/crypto/ghash-riscv64-zvkg.S | 72 +++ + arch/riscv/crypto/sha256-riscv64-glue.c | 137 +++++ + .../crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225 +++++++++ + arch/riscv/crypto/sha512-riscv64-glue.c | 133 +++++ + arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S | 203 ++++++++ + arch/riscv/crypto/sm3-riscv64-glue.c | 112 +++++ + arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S | 123 +++++ + arch/riscv/crypto/sm4-riscv64-glue.c | 107 ++++ + arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S | 117 +++++ + arch/riscv/include/asm/asm.h | 10 + + arch/riscv/include/asm/bitops.h | 138 +----- + arch/riscv/include/asm/pgalloc.h | 53 +- + arch/riscv/include/asm/pgtable.h | 6 + + arch/riscv/include/asm/tlb.h | 18 + + arch/riscv/include/asm/vector.h | 11 + + arch/riscv/kernel/entry.S | 3 + + arch/riscv/kernel/pi/Makefile | 3 + + arch/riscv/kernel/smpboot.c | 1 - + arch/riscv/kernel/traps.c | 17 +- + arch/riscv/lib/uaccess_vector.S | 1 - + arch/riscv/mm/init.c | 6 + + crypto/Kconfig | 3 + + drivers/clocksource/timer-clint.c | 2 +- + drivers/clocksource/timer-riscv.c | 2 +- + include/asm-generic/bitops/__ffs.h | 8 +- + include/asm-generic/bitops/__fls.h | 8 +- + include/asm-generic/bitops/ffs.h | 8 +- + include/asm-generic/bitops/fls.h | 8 +- + include/linux/mm.h | 2 +- + mm/mmap.c | 2 +- + 43 files changed, 3445 insertions(+), 141 deletions(-) + create mode 100644 arch/riscv/crypto/Kconfig + create mode 100644 arch/riscv/crypto/Makefile + create mode 100644 arch/riscv/crypto/aes-macros.S + create mode 100644 arch/riscv/crypto/aes-riscv64-glue.c + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S + create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned.S + create mode 100644 arch/riscv/crypto/chacha-riscv64-glue.c + create mode 100644 arch/riscv/crypto/chacha-riscv64-zvkb.S + create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c + create mode 100644 arch/riscv/crypto/ghash-riscv64-zvkg.S + create mode 100644 arch/riscv/crypto/sha256-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S + create mode 100644 arch/riscv/crypto/sha512-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S + create mode 100644 arch/riscv/crypto/sm3-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S + create mode 100644 arch/riscv/crypto/sm4-riscv64-glue.c + create mode 100644 arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S +Merging riscv-dt/riscv-dt-for-next (2db68ddbf33a riscv: dts: starfive: beaglev-starlight: Setup phy reset gpio) +$ git merge -m Merge branch 'riscv-dt-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt/riscv-dt-for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/pwm/opencores,pwm.yaml | 55 +++++++++++ + .../boot/dts/starfive/jh7100-beaglev-starlight.dts | 11 +++ + arch/riscv/boot/dts/starfive/jh7100-common.dtsi | 108 +++++++++++++++++++++ + .../dts/starfive/jh7100-starfive-visionfive-v1.dts | 22 ++++- + arch/riscv/boot/dts/starfive/jh7100.dtsi | 45 +++++++++ + .../dts/starfive/jh7110-starfive-visionfive-2.dtsi | 22 +++++ + arch/riscv/boot/dts/starfive/jh7110.dtsi | 9 ++ + 7 files changed, 271 insertions(+), 1 deletion(-) + create mode 100644 Documentation/devicetree/bindings/pwm/opencores,pwm.yaml +Merging riscv-soc/riscv-soc-for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'riscv-soc-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc/riscv-soc-for-next +Already up to date. +Merging s390/for-next (8eb3db95a8c8 Merge branch 'features' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390/for-next +Merge made by the 'ort' strategy. +Merging sh/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git:git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git sh/for-next +Already up to date. +Merging uml/next (83aec96c631e um: Mark 32bit syscall helpers as clobbering memory) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml/next +Already up to date. +Merging xtensa/xtensa-for-next (a03cd7602a09 xtensa: don't produce FDPIC output with fdpic toolchain) +$ git merge -m Merge branch 'xtensa-for-next' of git://github.com/jcmvbkbc/linux-xtensa.git xtensa/xtensa-for-next +Already up to date. +Merging bcachefs/for-next (6bb3f7f4c3f4 bcachefs: unlock parent dir if entry is not found in subvolume deletion) +$ git merge -m Merge branch 'for-next' of https://evilpiepirate.org/git/bcachefs.git bcachefs/for-next +Merge made by the 'ort' strategy. + fs/bcachefs/fs-ioctl.c | 4 ++-- + fs/bcachefs/mean_and_variance.h | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) +Merging pidfd/for-next (a901a3568fd2 Merge tag 'iomap-6.5-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git pidfd/for-next +Already up to date. +Merging fscrypt/for-next (c919330dd578 f2fs: fix double free of f2fs_sb_info) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt/for-next +Already up to date. +Merging afs/afs-next (abcbd3bfbbfe afs: trace: Log afs_make_call(), including server address) +$ git merge -m Merge branch 'afs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git afs/afs-next +Already up to date. +Merging btrfs/for-next (932ab07c383e Merge branch 'for-next-next-v6.8-20240108' into for-next-20240108) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs/for-next +Auto-merging fs/btrfs/extent-tree.c +Auto-merging fs/btrfs/extent_io.c +CONFLICT (content): Merge conflict in fs/btrfs/extent_io.c +Auto-merging fs/btrfs/inode.c +Auto-merging fs/btrfs/ioctl.c +Auto-merging fs/btrfs/zoned.c +Resolved 'fs/btrfs/extent_io.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 3b458c21387e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git +$ git diff -M --stat --summary HEAD^.. + fs/btrfs/accessors.c | 12 +++++----- + fs/btrfs/btrfs_inode.h | 3 +-- + fs/btrfs/ctree.c | 2 +- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/extent_io.c | 57 +++++++++++++++++++------------------------- + fs/btrfs/extent_io.h | 16 ++++++++++--- + fs/btrfs/file.c | 11 ++++----- + fs/btrfs/inode.c | 16 +++++-------- + fs/btrfs/tests/inode-tests.c | 40 +++++++++++++++---------------- + 9 files changed, 78 insertions(+), 81 deletions(-) +Merging ceph/master (ded080c86b3f rbd: don't move requests to the running list on errors) +$ git merge -m Merge branch 'master' of git://github.com/ceph/ceph-client.git ceph/master +Already up to date. +Merging cifs/for-next (2417900a8dce smb: client: parse uid, gid, mode and dev from WSL reparse points) +$ git merge -m Merge branch 'for-next' of git://git.samba.org/sfrench/cifs-2.6.git cifs/for-next +Merge made by the 'ort' strategy. + fs/smb/client/Makefile | 2 +- + fs/smb/client/cifsglob.h | 41 ++-- + fs/smb/client/cifsproto.h | 4 - + fs/smb/client/connect.c | 2 + + fs/smb/client/fs_context.c | 35 +++ + fs/smb/client/fs_context.h | 9 + + fs/smb/client/inode.c | 84 +------- + fs/smb/client/readdir.c | 20 +- + fs/smb/client/reparse.c | 526 +++++++++++++++++++++++++++++++++++++++++++++ + fs/smb/client/reparse.h | 113 ++++++++++ + fs/smb/client/smb2glob.h | 3 +- + fs/smb/client/smb2inode.c | 399 ++++++++++++++++++++++++---------- + fs/smb/client/smb2ops.c | 250 +-------------------- + fs/smb/client/smb2pdu.c | 29 ++- + fs/smb/client/smb2pdu.h | 36 +++- + fs/smb/client/smb2proto.h | 9 +- + fs/smb/client/trace.h | 2 + + fs/smb/client/transport.c | 4 +- + fs/smb/common/smbfsctl.h | 6 - + 19 files changed, 1081 insertions(+), 493 deletions(-) + create mode 100644 fs/smb/client/reparse.c + create mode 100644 fs/smb/client/reparse.h +Merging configfs/for-next (4425c1d9b44d configfs: improve item creation performance) +$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/configfs.git configfs/for-next +Auto-merging fs/configfs/inode.c +Merge made by the 'ort' strategy. + fs/configfs/configfs_internal.h | 4 ++-- + fs/configfs/dir.c | 42 +++++++++++++++++++++++++++++++---------- + fs/configfs/inode.c | 24 ----------------------- + 3 files changed, 34 insertions(+), 36 deletions(-) +Merging ecryptfs/next (a3d78fe3e1ae fs: ecryptfs: comment typo fix) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git ecryptfs/next +Auto-merging fs/ecryptfs/crypto.c +Auto-merging fs/ecryptfs/read_write.c +Merge made by the 'ort' strategy. + fs/ecryptfs/crypto.c | 2 +- + fs/ecryptfs/keystore.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging erofs/dev (aa12a790d31b erofs: make erofs_{err,info}() support NULL sb parameter) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs/dev +Already up to date. +Merging exfat/dev (8b29fa18400c exfat: ratelimit error msg in exfat_file_mmap()) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git exfat/dev +Merge made by the 'ort' strategy. + fs/exfat/exfat_fs.h | 5 +++++ + fs/exfat/file.c | 6 +++++- + fs/exfat/inode.c | 7 +++---- + 3 files changed, 13 insertions(+), 5 deletions(-) +Merging exportfs/exportfs-next (42c3732fa807 fs: Create a generic is_dot_dotdot() utility) +$ git merge -m Merge branch 'exportfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux exportfs/exportfs-next +Auto-merging fs/ecryptfs/crypto.c +Auto-merging fs/f2fs/f2fs.h +Auto-merging fs/namei.c +Auto-merging include/linux/fs.h +Merge made by the 'ort' strategy. + fs/crypto/fname.c | 8 +------- + fs/ecryptfs/crypto.c | 10 ---------- + fs/exportfs/expfs.c | 2 +- + fs/f2fs/f2fs.h | 11 ----------- + fs/namei.c | 6 ++---- + include/linux/fs.h | 11 +++++++++++ + 6 files changed, 15 insertions(+), 33 deletions(-) +Merging ext3/for_next (cd04011c5859 Merge fsnotify optimization & cleanup from Amir.) +$ git merge -m Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git ext3/for_next +Merge made by the 'ort' strategy. + fs/ext2/balloc.c | 2 +- + fs/ext2/inode.c | 2 +- + fs/ext2/xattr.c | 2 +- + fs/notify/fsnotify.c | 28 +++++++++++++++++----------- + fs/ocfs2/quota_global.c | 12 ++++++++++++ + fs/ocfs2/quota_local.c | 3 +++ + fs/quota/dquot.c | 13 +++++-------- + fs/quota/quota_tree.c | 24 ++++++++++++------------ + fs/quota/quota_v1.c | 6 ++++++ + fs/quota/quota_v2.c | 20 +++++++++++++++++++- + fs/udf/dir.c | 2 +- + fs/udf/inode.c | 2 +- + fs/udf/namei.c | 23 +++++++++++++---------- + fs/udf/super.c | 2 +- + include/linux/fsnotify.h | 12 +++++++++--- + 15 files changed, 102 insertions(+), 51 deletions(-) +Merging ext4/dev (68da4c44b994 ext4: fix inconsistent between segment fstrim and full fstrim) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git ext4/dev +Already up to date. +Merging f2fs/dev (f31438c16879 f2fs: fix to avoid potential panic during recovery) +$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git f2fs/dev +Auto-merging fs/f2fs/f2fs.h +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-fs-f2fs | 47 ++++++++++---------- + Documentation/filesystems/f2fs.rst | 47 ++++++++++---------- + fs/f2fs/checkpoint.c | 19 ++++++-- + fs/f2fs/compress.c | 45 +++++++++++-------- + fs/f2fs/data.c | 36 ++++++++------- + fs/f2fs/dir.c | 5 +-- + fs/f2fs/f2fs.h | 78 ++++++++++++++++++++++----------- + fs/f2fs/file.c | 51 ++++++++++++++------- + fs/f2fs/namei.c | 11 ++--- + fs/f2fs/node.c | 2 +- + fs/f2fs/recovery.c | 33 +++++++------- + fs/f2fs/segment.c | 4 +- + fs/f2fs/super.c | 57 ++++++++++++++---------- + 13 files changed, 254 insertions(+), 181 deletions(-) +Merging fsverity/for-next (919dc320956e fsverity: skip PKCS#7 parser when keyring is empty) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity/for-next +Already up to date. +Merging fuse/for-next (3f29f1c336c0 fuse: disable FOPEN_PARALLEL_DIRECT_WRITES with FUSE_DIRECT_IO_ALLOW_MMAP) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git fuse/for-next +Already up to date. +Merging gfs2/for-next (acd2d246f4b2 gfs2: Fix LOOKUP_RCU support in gfs2_drevalidate) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git gfs2/for-next +Merge made by the 'ort' strategy. + fs/gfs2/aops.c | 4 +- + fs/gfs2/bmap.c | 21 +++++----- + fs/gfs2/dentry.c | 32 ++++++++++----- + fs/gfs2/dir.c | 52 +++++++++++++---------- + fs/gfs2/dir.h | 2 +- + fs/gfs2/file.c | 4 +- + fs/gfs2/glops.c | 2 +- + fs/gfs2/incore.h | 1 - + fs/gfs2/inode.c | 10 ++--- + fs/gfs2/meta_io.c | 114 ++++++++++++++++++++++++++++++++++----------------- + fs/gfs2/meta_io.h | 15 ++++--- + fs/gfs2/ops_fstype.c | 4 +- + fs/gfs2/quota.c | 2 +- + fs/gfs2/recovery.c | 2 +- + fs/gfs2/rgrp.c | 5 ++- + fs/gfs2/super.c | 6 +-- + fs/gfs2/xattr.c | 17 ++++---- + 17 files changed, 176 insertions(+), 117 deletions(-) +Merging jfs/jfs-next (e42e29cc4423 Revert "jfs: fix shift-out-of-bounds in dbJoin") +$ git merge -m Merge branch 'jfs-next' of git://github.com/kleikamp/linux-shaggy.git jfs/jfs-next +Already up to date. +Merging ksmbd/ksmbd-for-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'ksmbd-for-next' of https://github.com/smfrench/smb3-kernel.git ksmbd/ksmbd-for-next +Already up to date. +Merging nfs/linux-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat) +$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git nfs/linux-next +Already up to date. +Merging nfs-anna/linux-next (57331a59ac0d NFSv4.1: Use the nfs_client's rpc timeouts for backchannel) +$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/anna/linux-nfs.git nfs-anna/linux-next +Already up to date. +Merging nfsd/nfsd-next (6c1c91f97746 nfsd: Simplify the allocation of slab caches in nfsd_file_cache_init) +$ git merge -m Merge branch 'nfsd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd/nfsd-next +Auto-merging net/sunrpc/svc.c +Merge made by the 'ort' strategy. + fs/lockd/svc.c | 3 - + fs/nfs/callback.c | 3 - + fs/nfsd/blocklayout.c | 4 +- + fs/nfsd/cache.h | 2 - + fs/nfsd/filecache.c | 76 ++-- + fs/nfsd/filecache.h | 1 + + fs/nfsd/netns.h | 29 +- + fs/nfsd/nfs4callback.c | 96 +++-- + fs/nfsd/nfs4layouts.c | 63 ++-- + fs/nfsd/nfs4proc.c | 6 +- + fs/nfsd/nfs4state.c | 642 ++++++++++++++++++++++++---------- + fs/nfsd/nfs4xdr.c | 19 +- + fs/nfsd/nfscache.c | 40 +-- + fs/nfsd/nfsctl.c | 17 +- + fs/nfsd/nfsd.h | 2 + + fs/nfsd/nfsfh.c | 3 +- + fs/nfsd/nfssvc.c | 16 +- + fs/nfsd/pnfs.h | 8 +- + fs/nfsd/state.h | 52 ++- + fs/nfsd/stats.c | 52 ++- + fs/nfsd/stats.h | 70 ++-- + fs/nfsd/trace.h | 194 +++++++++- + fs/nfsd/vfs.c | 48 ++- + fs/nfsd/vfs.h | 2 + + include/linux/sunrpc/svc.h | 5 +- + include/trace/misc/nfs.h | 34 ++ + net/sunrpc/auth_gss/gss_krb5_crypto.c | 14 +- + net/sunrpc/auth_gss/gss_krb5_mech.c | 11 +- + net/sunrpc/auth_gss/gss_rpc_xdr.c | 27 +- + net/sunrpc/stats.c | 2 +- + net/sunrpc/svc.c | 40 ++- + net/sunrpc/xprtsock.c | 9 - + 32 files changed, 1080 insertions(+), 510 deletions(-) +Merging ntfs3/master (622cd3daa8ea fs/ntfs3: Slightly simplify ntfs_inode_printk()) +$ git merge -m Merge branch 'master' of https://github.com/Paragon-Software-Group/linux-ntfs3.git ntfs3/master +Merge made by the 'ort' strategy. + fs/ntfs3/attrib.c | 45 +++++++---- + fs/ntfs3/attrlist.c | 12 +-- + fs/ntfs3/bitmap.c | 4 +- + fs/ntfs3/dir.c | 48 ++++++++--- + fs/ntfs3/file.c | 76 ++++++++++++++---- + fs/ntfs3/frecord.c | 19 +++-- + fs/ntfs3/fslog.c | 228 ++++++++++++++++++++++++---------------------------- + fs/ntfs3/fsntfs.c | 29 ++++++- + fs/ntfs3/index.c | 8 +- + fs/ntfs3/inode.c | 32 ++++++-- + fs/ntfs3/namei.c | 12 +++ + fs/ntfs3/ntfs.h | 4 +- + fs/ntfs3/ntfs_fs.h | 29 +++---- + fs/ntfs3/record.c | 18 ++++- + fs/ntfs3/super.c | 54 ++++++++----- + fs/ntfs3/xattr.c | 6 ++ + 16 files changed, 379 insertions(+), 245 deletions(-) +Merging orangefs/for-next (31720a2b109b orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init()) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux orangefs/for-next +Already up to date. +Merging overlayfs/overlayfs-next (d17bb4620f90 overlayfs.rst: fix ReST formatting) +$ git merge -m Merge branch 'overlayfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs/overlayfs-next +Already up to date. +Merging ubifs/next (adbf4c4954e3 ubi: block: fix memleak in ubiblock_create()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs/next +Already up to date. +Merging v9fs/9p-next (ff49bf186757 net: 9p: avoid freeing uninit memory in p9pdu_vreadf) +$ git merge -m Merge branch '9p-next' of git://github.com/martinetd/linux v9fs/9p-next +Already up to date. +Merging v9fs-ericvh/ericvh/for-next (be57855f5050 fs/9p: fix dups even in uncached mode) +$ git merge -m Merge branch 'ericvh/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-ericvh/ericvh/for-next +Merge made by the 'ort' strategy. + fs/9p/v9fs.h | 31 ++------ + fs/9p/v9fs_vfs.h | 11 ++- + fs/9p/vfs_dir.c | 4 +- + fs/9p/vfs_inode.c | 150 ++++++-------------------------------- + fs/9p/vfs_inode_dotl.c | 194 +++++++++---------------------------------------- + fs/9p/vfs_super.c | 45 +----------- + 6 files changed, 71 insertions(+), 364 deletions(-) +Merging xfs/for-next (881f78f47255 xfs: remove conditional building of rt geometry validator functions) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git xfs/for-next +Merge made by the 'ort' strategy. + fs/xfs/libxfs/xfs_attr.c | 6 +++--- + fs/xfs/libxfs/xfs_rtbitmap.c | 14 -------------- + fs/xfs/libxfs/xfs_rtbitmap.h | 16 ---------------- + fs/xfs/libxfs/xfs_sb.c | 14 ++++++++++++++ + fs/xfs/libxfs/xfs_sb.h | 2 ++ + fs/xfs/libxfs/xfs_types.h | 12 ++++++++++++ + fs/xfs/scrub/rtbitmap.c | 1 + + fs/xfs/scrub/rtsummary.c | 1 + + 8 files changed, 33 insertions(+), 33 deletions(-) +Merging zonefs/for-next (8812387d0569 zonefs: set FMODE_CAN_ODIRECT instead of a dummy direct_IO method) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git zonefs/for-next +Already up to date. +Merging iomap/iomap-for-next (3ac974796e5d iomap: fix short copy in iomap_write_iter()) +$ git merge -m Merge branch 'iomap-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git iomap/iomap-for-next +Already up to date. +Merging djw-vfs/vfs-for-next (ce85a1e04645 xfs: stabilize fs summary counters for online fsck) +$ git merge -m Merge branch 'vfs-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git djw-vfs/vfs-for-next +Already up to date. +Merging file-locks/locks-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux) +$ git merge -m Merge branch 'locks-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git file-locks/locks-next +Already up to date. +Merging iversion/iversion-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux) +$ git merge -m Merge branch 'iversion-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git iversion/iversion-next +Already up to date. +Merging vfs-brauner/vfs.all (de9861b0c277 Merge branch 'vfs.fs' into vfs.all) + 563bd99dc191 ("iov_iter: Avoid wrap-around instrumentation in copy_compat_iovec_from_user()") +$ git merge -m Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs-brauner/vfs.all +Auto-merging CREDITS +Auto-merging MAINTAINERS +Auto-merging fs/Kconfig +Auto-merging fs/ntfs3/namei.c +Auto-merging init/initramfs.c +Auto-merging kernel/exit.c +Auto-merging lib/iov_iter.c +Auto-merging mm/filemap.c +Merge made by the 'ort' strategy. + CREDITS | 5 + + Documentation/filesystems/index.rst | 1 - + Documentation/filesystems/ntfs.rst | 466 --- + MAINTAINERS | 10 - + fs/Kconfig | 1 - + fs/Makefile | 1 - + fs/attr.c | 2 +- + fs/backing-file.c | 4 +- + fs/buffer.c | 10 +- + fs/eventfd.c | 14 +- + fs/exec.c | 6 +- + fs/fhandle.c | 2 +- + fs/fs-writeback.c | 25 + + fs/inode.c | 3 +- + fs/netfs/buffered_write.c | 3 + + fs/netfs/direct_write.c | 5 +- + fs/netfs/io.c | 2 + + fs/ntfs/Kconfig | 81 - + fs/ntfs/Makefile | 15 - + fs/ntfs/aops.c | 1744 ----------- + fs/ntfs/aops.h | 88 - + fs/ntfs/attrib.c | 2624 ---------------- + fs/ntfs/attrib.h | 102 - + fs/ntfs/bitmap.c | 179 -- + fs/ntfs/bitmap.h | 104 - + fs/ntfs/collate.c | 110 - + fs/ntfs/collate.h | 36 - + fs/ntfs/compress.c | 950 ------ + fs/ntfs/debug.c | 159 - + fs/ntfs/debug.h | 57 - + fs/ntfs/dir.c | 1540 ---------- + fs/ntfs/dir.h | 34 - + fs/ntfs/endian.h | 79 - + fs/ntfs/file.c | 1997 ------------ + fs/ntfs/index.c | 440 --- + fs/ntfs/index.h | 134 - + fs/ntfs/inode.c | 3102 ------------------- + fs/ntfs/inode.h | 310 -- + fs/ntfs/layout.h | 2421 --------------- + fs/ntfs/lcnalloc.c | 1000 ------ + fs/ntfs/lcnalloc.h | 131 - + fs/ntfs/logfile.c | 849 ------ + fs/ntfs/logfile.h | 295 -- + fs/ntfs/malloc.h | 77 - + fs/ntfs/mft.c | 2907 ------------------ + fs/ntfs/mft.h | 110 - + fs/ntfs/mst.c | 189 -- + fs/ntfs/namei.c | 392 --- + fs/ntfs/ntfs.h | 150 - + fs/ntfs/quota.c | 103 - + fs/ntfs/quota.h | 21 - + fs/ntfs/runlist.c | 1893 ------------ + fs/ntfs/runlist.h | 88 - + fs/ntfs/super.c | 3202 -------------------- + fs/ntfs/sysctl.c | 58 - + fs/ntfs/sysctl.h | 27 - + fs/ntfs/time.h | 89 - + fs/ntfs/types.h | 55 - + fs/ntfs/unistr.c | 384 --- + fs/ntfs/upcase.c | 73 - + fs/ntfs/usnjrnl.c | 70 - + fs/ntfs/usnjrnl.h | 191 -- + fs/ntfs/volume.h | 164 - + fs/ntfs3/namei.c | 2 +- + fs/pipe.c | 81 +- + fs/select.c | 13 +- + fs/sysv/itree.c | 10 +- + include/asm-generic/barrier.h | 2 - + include/linux/backing-dev.h | 1 - + include/linux/fs.h | 18 +- + include/linux/pid.h | 4 +- + include/uapi/linux/fs.h | 5 +- + include/uapi/linux/pidfd.h | 3 +- + init/initramfs.c | 2 - + kernel/exit.c | 7 + + kernel/fork.c | 51 +- + kernel/pid.c | 16 +- + kernel/signal.c | 12 +- + lib/iov_iter.c | 55 +- + mm/backing-dev.c | 25 - + mm/filemap.c | 9 - + .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 +- + .../move_mount_set_group_test.c | 4 +- + 83 files changed, 225 insertions(+), 29489 deletions(-) + delete mode 100644 Documentation/filesystems/ntfs.rst + delete mode 100644 fs/ntfs/Kconfig + delete mode 100644 fs/ntfs/Makefile + delete mode 100644 fs/ntfs/aops.c + delete mode 100644 fs/ntfs/aops.h + delete mode 100644 fs/ntfs/attrib.c + delete mode 100644 fs/ntfs/attrib.h + delete mode 100644 fs/ntfs/bitmap.c + delete mode 100644 fs/ntfs/bitmap.h + delete mode 100644 fs/ntfs/collate.c + delete mode 100644 fs/ntfs/collate.h + delete mode 100644 fs/ntfs/compress.c + delete mode 100644 fs/ntfs/debug.c + delete mode 100644 fs/ntfs/debug.h + delete mode 100644 fs/ntfs/dir.c + delete mode 100644 fs/ntfs/dir.h + delete mode 100644 fs/ntfs/endian.h + delete mode 100644 fs/ntfs/file.c + delete mode 100644 fs/ntfs/index.c + delete mode 100644 fs/ntfs/index.h + delete mode 100644 fs/ntfs/inode.c + delete mode 100644 fs/ntfs/inode.h + delete mode 100644 fs/ntfs/layout.h + delete mode 100644 fs/ntfs/lcnalloc.c + delete mode 100644 fs/ntfs/lcnalloc.h + delete mode 100644 fs/ntfs/logfile.c + delete mode 100644 fs/ntfs/logfile.h + delete mode 100644 fs/ntfs/malloc.h + delete mode 100644 fs/ntfs/mft.c + delete mode 100644 fs/ntfs/mft.h + delete mode 100644 fs/ntfs/mst.c + delete mode 100644 fs/ntfs/namei.c + delete mode 100644 fs/ntfs/ntfs.h + delete mode 100644 fs/ntfs/quota.c + delete mode 100644 fs/ntfs/quota.h + delete mode 100644 fs/ntfs/runlist.c + delete mode 100644 fs/ntfs/runlist.h + delete mode 100644 fs/ntfs/super.c + delete mode 100644 fs/ntfs/sysctl.c + delete mode 100644 fs/ntfs/sysctl.h + delete mode 100644 fs/ntfs/time.h + delete mode 100644 fs/ntfs/types.h + delete mode 100644 fs/ntfs/unistr.c + delete mode 100644 fs/ntfs/upcase.c + delete mode 100644 fs/ntfs/usnjrnl.c + delete mode 100644 fs/ntfs/usnjrnl.h + delete mode 100644 fs/ntfs/volume.h +Merging vfs/for-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs/for-next +Already up to date. +Merging printk/for-next (6c3a34e38436 Merge branch 'for-6.8' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git printk/for-next +Merge made by the 'ort' strategy. +Merging pci/next (95bf9132f8b4 Merge branch 'pci/dpc') +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci/next +Merge made by the 'ort' strategy. + drivers/pci/pcie/dpc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging pstore/for-next/pstore (24a0b5e196cf pstore: inode: Use cleanup.h for struct pstore_private) +$ git merge -m Merge branch 'for-next/pstore' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git pstore/for-next/pstore +Already up to date. +Merging hid/for-next (a54f72c74c2d Merge branch 'for-6.8/upstream-fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git hid/for-next +Merge made by the 'ort' strategy. + drivers/hid/bpf/hid_bpf_dispatch.c | 117 ++++-- + drivers/hid/bpf/hid_bpf_dispatch.h | 4 +- + drivers/hid/bpf/hid_bpf_jmp_table.c | 40 +- + drivers/hid/hid-ids.h | 10 + + drivers/hid/hid-lenovo.c | 57 ++- + drivers/hid/hid-logitech-hidpp.c | 2 + + drivers/hid/hid-nintendo.c | 10 - + drivers/hid/hid-nvidia-shield.c | 4 + + drivers/hid/hid-samsung.c | 437 +++++++++++++++++++-- + drivers/hid/hid-steam.c | 36 +- + drivers/hid/hidraw.c | 7 +- + drivers/hid/i2c-hid/i2c-hid-core.c | 6 +- + drivers/hid/i2c-hid/i2c-hid-of.c | 1 + + include/linux/hid_bpf.h | 11 - + .../selftests/hid/tests/test_wacom_generic.py | 8 +- + 15 files changed, 593 insertions(+), 157 deletions(-) +Merging i2c/i2c/for-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'i2c/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git i2c/i2c/for-next +Already up to date. +Merging i2c-host/i2c/i2c-host (11f1357336cd i2c: imx: move to generic GPIO recovery) +$ git merge -m Merge branch 'i2c/i2c-host' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host/i2c/i2c-host +Merge made by the 'ort' strategy. + .../devicetree/bindings/i2c/i2c-mux-pca954x.yaml | 30 +++++++++++ + drivers/i2c/busses/i2c-i801.c | 12 ++--- + drivers/i2c/busses/i2c-imx.c | 62 ++-------------------- + drivers/i2c/muxes/i2c-mux-pca954x.c | 43 ++++++++++++++- + 4 files changed, 82 insertions(+), 65 deletions(-) +Merging i3c/i3c/next (4fa0888f6f3e i3c: document hotjoin sysfs entry) +$ git merge -m Merge branch 'i3c/next' of git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git i3c/i3c/next +Already up to date. +Merging hwmon-staging/hwmon-next (6120fec68e78 hwmon: ltc4282: add support for the LTC4282 chip) +$ git merge -m Merge branch 'hwmon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-staging/hwmon-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-class-hwmon | 9 + + .../devicetree/bindings/hwmon/adi,ltc4282.yaml | 159 ++ + .../devicetree/bindings/hwmon/ti,ina2xx.yaml | 9 + + Documentation/hwmon/emc2305.rst | 1 - + Documentation/hwmon/index.rst | 1 + + Documentation/hwmon/ltc4282.rst | 133 ++ + Documentation/hwmon/nct6683.rst | 1 + + MAINTAINERS | 26 +- + drivers/hwmon/Kconfig | 11 + + drivers/hwmon/Makefile | 1 + + drivers/hwmon/adm1177.c | 1 - + drivers/hwmon/adt7410.c | 2 - + drivers/hwmon/ds1621.c | 1 - + drivers/hwmon/ds620.c | 1 - + drivers/hwmon/emc2305.c | 5 - + drivers/hwmon/hwmon.c | 1 + + drivers/hwmon/ina209.c | 1 - + drivers/hwmon/ina238.c | 1 - + drivers/hwmon/ltc4282.c | 1784 ++++++++++++++++++++ + drivers/hwmon/max127.c | 1 - + drivers/hwmon/max31760.c | 1 - + drivers/hwmon/max31790.c | 1 - + drivers/hwmon/max31827.c | 1 - + drivers/hwmon/max6621.c | 1 - + drivers/hwmon/max6697.c | 1 - + drivers/hwmon/nct6683.c | 3 + + drivers/hwmon/occ/p8_i2c.c | 1 - + drivers/hwmon/pmbus/ir36021.c | 1 - + drivers/hwmon/pmbus/pmbus_core.c | 2 +- + drivers/hwmon/powr1220.c | 1 - + drivers/hwmon/sbrmi.c | 1 - + drivers/hwmon/sbtsi_temp.c | 1 - + drivers/hwmon/w83773g.c | 1 - + include/linux/hwmon.h | 14 +- + 34 files changed, 2129 insertions(+), 50 deletions(-) + create mode 100644 Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml + create mode 100644 Documentation/hwmon/ltc4282.rst + create mode 100644 drivers/hwmon/ltc4282.c +Merging jc_docs/docs-next (5c7944ca7b13 coding-style: Add guidance to prefer dev_dbg) +$ git merge -m Merge branch 'docs-next' of git://git.lwn.net/linux.git jc_docs/docs-next +Merge made by the 'ort' strategy. + Documentation/RCU/torture.rst | 2 +- + Documentation/doc-guide/kernel-doc.rst | 45 ++ + Documentation/doc-guide/maintainer-profile.rst | 7 + + Documentation/driver-api/index.rst | 209 +++--- + Documentation/process/coding-style.rst | 3 +- + Documentation/subsystem-apis.rst | 2 + + Documentation/translations/it_IT/RCU/index.rst | 19 + + Documentation/translations/it_IT/RCU/torture.rst | 369 +++++++++ + .../translations/it_IT/core-api/index.rst | 12 + + Documentation/translations/it_IT/index.rst | 1 + + Documentation/translations/it_IT/locking/index.rst | 20 + + .../translations/it_IT/locking/lockdep-design.rst | 678 +++++++++++++++++ + .../translations/it_IT/locking/lockstat.rst | 230 ++++++ + .../translations/it_IT/locking/locktorture.rst | 181 +++++ + .../translations/it_IT/locking/locktypes.rst | 547 ++++++++++++++ + Documentation/userspace-api/index.rst | 47 +- + Documentation/userspace-api/perf_ring_buffer.rst | 830 +++++++++++++++++++++ + drivers/gpu/drm/drm_gem_vram_helper.c | 44 +- + include/drm/drm_gem_vram_helper.h | 16 +- + scripts/kernel-doc | 4 +- + scripts/sphinx-pre-install | 9 +- + 21 files changed, 3138 insertions(+), 137 deletions(-) + create mode 100644 Documentation/translations/it_IT/RCU/index.rst + create mode 100644 Documentation/translations/it_IT/RCU/torture.rst + create mode 100644 Documentation/translations/it_IT/locking/index.rst + create mode 100644 Documentation/translations/it_IT/locking/lockdep-design.rst + create mode 100644 Documentation/translations/it_IT/locking/lockstat.rst + create mode 100644 Documentation/translations/it_IT/locking/locktorture.rst + create mode 100644 Documentation/translations/it_IT/locking/locktypes.rst + create mode 100644 Documentation/userspace-api/perf_ring_buffer.rst +Merging v4l-dvb/master (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'master' of git://linuxtv.org/media_tree.git v4l-dvb/master +Already up to date. +Merging v4l-dvb-next/master (04447d48afd3 media: mediatek: vcodec: drop excess struct members descriptions) +$ git merge -m Merge branch 'master' of git://linuxtv.org/mchehab/media-next.git v4l-dvb-next/master +Merge made by the 'ort' strategy. + drivers/media/i2c/alvium-csi2.c | 2 +- + drivers/media/i2c/ar0521.c | 6 +- + drivers/media/mc/mc-devnode.c | 1 - + drivers/media/pci/intel/ipu3/ipu3-cio2.c | 22 ++----- + drivers/media/platform/cadence/cdns-csi2rx.c | 19 +++++- + .../mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c | 1 - + .../vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 1 - + .../platform/samsung/exynos4-is/fimc-capture.c | 52 +++++++-------- + .../media/platform/samsung/exynos4-is/fimc-core.c | 23 +++---- + .../media/platform/samsung/exynos4-is/fimc-core.h | 23 ++++--- + .../platform/samsung/exynos4-is/fimc-isp-video.c | 2 +- + .../platform/samsung/exynos4-is/fimc-lite-reg.c | 13 ++-- + .../platform/samsung/exynos4-is/fimc-lite-reg.h | 12 ++-- + .../media/platform/samsung/exynos4-is/fimc-lite.c | 2 +- + .../media/platform/samsung/exynos4-is/fimc-m2m.c | 23 +++---- + .../media/platform/samsung/exynos4-is/fimc-reg.c | 38 +++++------ + .../media/platform/samsung/exynos4-is/fimc-reg.h | 10 +-- + drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c | 76 +++++++++++----------- + .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.c | 8 +-- + .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.h | 2 +- + .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.c | 6 +- + .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.h | 2 +- + .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.c | 8 +-- + .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.h | 2 +- + .../platform/samsung/s5p-mfc/s5p_mfc_common.h | 14 ++-- + .../media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c | 26 ++++---- + .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.c | 20 +++--- + .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.h | 3 +- + .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.c | 12 ++-- + .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.h | 3 +- + .../media/platform/samsung/s5p-mfc/s5p_mfc_opr.c | 7 +- + .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.c | 28 ++++---- + .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.h | 2 +- + .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c | 36 +++++----- + .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.h | 2 +- + .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.c | 51 ++++++--------- + .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.h | 8 +-- + .../media/platform/ti/j721e-csi2rx/j721e-csi2rx.c | 24 +++++++ + drivers/media/platform/xilinx/Kconfig | 4 +- + drivers/media/v4l2-core/v4l2-mc.c | 23 +++++-- + .../staging/media/ipu3/include/uapi/intel-ipu3.h | 3 - + drivers/staging/media/ipu3/ipu3-v4l2.c | 16 ++--- + include/media/media-entity.h | 4 -- + 43 files changed, 326 insertions(+), 314 deletions(-) +Merging pm/linux-next (7543bfcb6b1a Merge branch 'thermal-core' into linux-next) +$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git pm/linux-next +Merge made by the 'ort' strategy. + drivers/base/power/main.c | 74 ++++++++++++++++++---------------------- + drivers/thermal/gov_bang_bang.c | 2 +- + drivers/thermal/gov_fair_share.c | 16 +++++---- + include/linux/pm.h | 30 ++++++++-------- + 4 files changed, 59 insertions(+), 63 deletions(-) +Merging cpufreq-arm/cpufreq/arm/linux-next (eaffb10b51bf cpufreq: mediatek-hw: Don't error out if supply is not found) +$ git merge -m Merge branch 'cpufreq/arm/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git cpufreq-arm/cpufreq/arm/linux-next +Merge made by the 'ort' strategy. + Documentation/power/opp.rst | 2 +- + Documentation/translations/zh_CN/power/opp.rst | 2 +- + drivers/cpufreq/brcmstb-avs-cpufreq.c | 2 ++ + drivers/cpufreq/imx6q-cpufreq.c | 43 +++++++++----------------- + drivers/cpufreq/mediatek-cpufreq-hw.c | 19 +++++++++++- + 5 files changed, 36 insertions(+), 32 deletions(-) +Merging cpupower/cpupower (0086ffec768b tools cpupower bench: Override CFLAGS assignments) +$ git merge -m Merge branch 'cpupower' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git cpupower/cpupower +Already up to date. +Merging devfreq/devfreq-next (aed5ed595960 PM / devfreq: Synchronize devfreq_monitor_[start/stop]) +$ git merge -m Merge branch 'devfreq-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq/devfreq-next +Already up to date. +Merging pmdomain/next (90a7463fae9e pmdomain: renesas: r8a779h0-sysc: Add r8a779h0 support) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain/next +Merge made by the 'ort' strategy. + drivers/pmdomain/core.c | 133 ++++++++++++++++++------------ + drivers/pmdomain/imx/imx8m-blk-ctrl.c | 9 +- + drivers/pmdomain/imx/imx8mp-blk-ctrl.c | 9 +- + drivers/pmdomain/qcom/rpmpd.c | 13 ++- + drivers/pmdomain/renesas/Kconfig | 4 + + drivers/pmdomain/renesas/Makefile | 1 + + drivers/pmdomain/renesas/r8a779a0-sysc.c | 12 --- + drivers/pmdomain/renesas/r8a779f0-sysc.c | 12 --- + drivers/pmdomain/renesas/r8a779g0-sysc.c | 12 --- + drivers/pmdomain/renesas/r8a779h0-sysc.c | 54 ++++++++++++ + drivers/pmdomain/renesas/rcar-gen4-sysc.c | 3 + + drivers/pmdomain/renesas/rcar-gen4-sysc.h | 1 + + drivers/pmdomain/ti/omap_prm.c | 2 + + 13 files changed, 165 insertions(+), 100 deletions(-) + create mode 100644 drivers/pmdomain/renesas/r8a779h0-sysc.c +Merging opp/opp/linux-next (ace4b31b297d cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h) +$ git merge -m Merge branch 'opp/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git opp/opp/linux-next +Merge made by the 'ort' strategy. + include/linux/cpufreq.h | 20 -------------------- + include/linux/pm_opp.h | 16 ++++++++++++++++ + 2 files changed, 16 insertions(+), 20 deletions(-) +Merging thermal/thermal/linux-next (5314b1543787 thermal/drivers/exynos: Use set_trips ops) +$ git merge -m Merge branch 'thermal/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git thermal/thermal/linux-next +Already up to date. +Merging dlm/next (5beebc1dda47 dlm: update format header reflect current format) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git dlm/next +Already up to date. +Merging rdma/for-next (a400073ce3dd RDMA/mlx5: Delete unused mlx5_ib_copy_pas prototype) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma/for-next +Merge made by the 'ort' strategy. + drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 - + drivers/infiniband/hw/hfi1/tid_rdma.c | 25 +- + drivers/infiniband/hw/hns/hns_roce_cq.c | 11 +- + drivers/infiniband/hw/hns/hns_roce_device.h | 16 +- + drivers/infiniband/hw/hns/hns_roce_hem.c | 95 ++----- + drivers/infiniband/hw/hns/hns_roce_hem.h | 56 +--- + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 111 ++++---- + drivers/infiniband/hw/hns/hns_roce_mr.c | 341 ++++++++++++++++++------- + drivers/infiniband/hw/mana/cq.c | 25 +- + drivers/infiniband/hw/mana/main.c | 40 +-- + drivers/infiniband/hw/mana/mana_ib.h | 20 +- + drivers/infiniband/hw/mana/mr.c | 13 +- + drivers/infiniband/hw/mana/qp.c | 88 ++----- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - + drivers/infiniband/sw/rxe/rxe.c | 6 +- + drivers/infiniband/sw/rxe/rxe.h | 6 +- + drivers/infiniband/sw/rxe/rxe_comp.c | 4 +- + drivers/infiniband/sw/rxe/rxe_cq.c | 4 +- + drivers/infiniband/sw/rxe/rxe_mr.c | 16 +- + drivers/infiniband/sw/rxe/rxe_mw.c | 2 +- + drivers/infiniband/sw/rxe/rxe_qp.c | 8 +- + drivers/infiniband/sw/rxe/rxe_resp.c | 12 +- + drivers/infiniband/sw/rxe/rxe_task.c | 4 +- + drivers/infiniband/sw/rxe/rxe_verbs.c | 216 ++++++++-------- + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 3 +- + 25 files changed, 576 insertions(+), 549 deletions(-) +Merging net-next/main (e7f8df0e81bf dpll: move xa_erase() call in to match dpll_pin_alloc() error path order) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git net-next/main +Auto-merging .mailmap +Auto-merging MAINTAINERS +Auto-merging drivers/net/dsa/mt7530.c +Auto-merging drivers/net/ethernet/google/gve/gve_rx.c +Auto-merging drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +Auto-merging drivers/net/wireless/microchip/wilc1000/netdev.c +Merge made by the 'ort' strategy. + .mailmap | 1 + + .../bpf/standardization/instruction-set.rst | 80 +- + Documentation/bpf/verifier.rst | 2 +- + Documentation/dev-tools/kselftest.rst | 12 + + Documentation/devicetree/bindings/leds/common.yaml | 12 + + .../devicetree/bindings/leds/leds-bcm63138.yaml | 4 - + .../devicetree/bindings/leds/leds-bcm6328.yaml | 4 - + .../devicetree/bindings/leds/leds-bcm6358.txt | 2 - + .../bindings/leds/leds-pwm-multicolor.yaml | 4 - + .../devicetree/bindings/leds/leds-pwm.yaml | 5 - + .../devicetree/bindings/net/nfc/ti,trf7970a.yaml | 2 +- + .../devicetree/bindings/net/qca,qca808x.yaml | 54 + + .../devicetree/bindings/net/snps,dwmac.yaml | 11 +- + .../bindings/net/starfive,jh7110-dwmac.yaml | 72 +- + Documentation/networking/devlink/mlx5.rst | 4 + + MAINTAINERS | 10 + + arch/arm64/net/bpf_jit_comp.c | 5 + + arch/x86/net/bpf_jit_comp.c | 5 + + drivers/dpll/dpll_core.c | 2 +- + drivers/media/rc/bpf-lirc.c | 2 +- + drivers/net/arcnet/arcnet.c | 1 + + drivers/net/dsa/Kconfig | 2 +- + drivers/net/dsa/b53/b53_common.c | 10 +- + drivers/net/dsa/b53/b53_priv.h | 6 +- + drivers/net/dsa/bcm_sf2.c | 2 +- + drivers/net/dsa/microchip/ksz8795.c | 410 +++++--- + drivers/net/dsa/microchip/ksz8795_reg.h | 1 + + drivers/net/dsa/microchip/ksz_common.c | 4 +- + drivers/net/dsa/mt7530-mdio.c | 7 +- + drivers/net/dsa/mt7530.c | 173 ++-- + drivers/net/dsa/mt7530.h | 16 +- + drivers/net/dsa/mv88e6xxx/chip.c | 4 +- + drivers/net/dsa/qca/qca8k-common.c | 4 +- + drivers/net/dsa/qca/qca8k.h | 4 +- + .../net/ethernet/aquantia/atlantic/aq_ethtool.c | 12 +- + drivers/net/ethernet/broadcom/asp2/bcmasp.h | 2 +- + .../net/ethernet/broadcom/asp2/bcmasp_ethtool.c | 8 +- + drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 9 +- + .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 14 +- + drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 14 +- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 20 +- + drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 20 +- + drivers/net/ethernet/broadcom/genet/bcmgenet.c | 8 +- + drivers/net/ethernet/broadcom/genet/bcmgenet.h | 2 +- + drivers/net/ethernet/broadcom/tg3.c | 32 +- + drivers/net/ethernet/broadcom/tg3.h | 2 +- + drivers/net/ethernet/ec_bhf.c | 1 + + drivers/net/ethernet/engleder/tsnep_main.c | 10 +- + drivers/net/ethernet/freescale/enetc/enetc.c | 4 +- + drivers/net/ethernet/freescale/fec.h | 2 +- + drivers/net/ethernet/freescale/fec_main.c | 10 +- + drivers/net/ethernet/freescale/gianfar.c | 4 +- + drivers/net/ethernet/google/gve/gve.h | 144 ++- + drivers/net/ethernet/google/gve/gve_dqo.h | 18 +- + drivers/net/ethernet/google/gve/gve_main.c | 862 ++++++++++------ + drivers/net/ethernet/google/gve/gve_rx.c | 135 ++- + drivers/net/ethernet/google/gve/gve_rx_dqo.c | 91 +- + drivers/net/ethernet/google/gve/gve_tx.c | 128 ++- + drivers/net/ethernet/google/gve/gve_tx_dqo.c | 108 +- + drivers/net/ethernet/google/gve/gve_utils.c | 31 + + drivers/net/ethernet/google/gve/gve_utils.h | 5 + + drivers/net/ethernet/intel/e1000e/ethtool.c | 16 +- + drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 16 +- + drivers/net/ethernet/intel/ice/ice.h | 1 - + drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- + drivers/net/ethernet/intel/ice/ice_main.c | 4 +- + drivers/net/ethernet/intel/ice/ice_ptp.c | 233 +++-- + drivers/net/ethernet/intel/ice/ice_ptp.h | 34 +- + drivers/net/ethernet/intel/igb/igb_ethtool.c | 28 +- + drivers/net/ethernet/intel/igc/igc.h | 2 +- + drivers/net/ethernet/intel/igc/igc_ethtool.c | 20 +- + drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 28 +- + drivers/net/ethernet/marvell/mvneta.c | 4 +- + drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 1 + + drivers/net/ethernet/marvell/octeontx2/af/npc.h | 15 +- + .../ethernet/marvell/octeontx2/af/npc_profile.h | 621 ++++++++++-- + .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 7 + + .../mellanox/mlxsw/core_acl_flex_actions.c | 16 +- + .../ethernet/mellanox/mlxsw/core_acl_flex_keys.c | 9 +- + drivers/net/ethernet/mellanox/mlxsw/minimal.c | 1 - + drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 160 +-- + drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 15 +- + drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 11 +- + .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 17 +- + .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 15 +- + .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 +- + drivers/net/ethernet/microchip/encx24j600-regmap.c | 1 + + drivers/net/ethernet/microchip/lan743x_ethtool.c | 4 +- + .../microchip/lan966x/lan966x_vcap_debugfs.c | 2 + + drivers/net/ethernet/mscc/ocelot.c | 1 + + drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 32 +- + drivers/net/ethernet/qualcomm/emac/emac.c | 1 + + drivers/net/ethernet/qualcomm/qca_7k.c | 17 +- + drivers/net/ethernet/qualcomm/qca_7k.h | 16 +- + drivers/net/ethernet/qualcomm/qca_7k_common.c | 17 +- + drivers/net/ethernet/qualcomm/qca_7k_common.h | 29 +- + drivers/net/ethernet/qualcomm/qca_debug.c | 21 +- + drivers/net/ethernet/qualcomm/qca_debug.h | 15 +- + drivers/net/ethernet/qualcomm/qca_spi.c | 71 +- + drivers/net/ethernet/qualcomm/qca_spi.h | 22 +- + drivers/net/ethernet/qualcomm/qca_uart.c | 17 +- + drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 1 + + drivers/net/ethernet/realtek/r8169_main.c | 4 +- + drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c | 4 +- + drivers/net/ethernet/smsc/smc91x.c | 1 + + drivers/net/ethernet/smsc/smsc911x.c | 1 + + drivers/net/ethernet/smsc/smsc9420.c | 1 + + drivers/net/ethernet/stmicro/stmmac/Kconfig | 6 +- + drivers/net/ethernet/stmicro/stmmac/common.h | 2 + + .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 1 + + .../net/ethernet/stmicro/stmmac/dwmac-starfive.c | 32 +- + drivers/net/ethernet/stmicro/stmmac/stmmac_est.c | 6 + + .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4 +- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 22 + + drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 87 +- + drivers/net/ethernet/ti/am65-cpsw-ethtool.c | 4 +- + drivers/net/ethernet/ti/cpsw-common.c | 1 + + drivers/net/ethernet/ti/cpsw_ethtool.c | 4 +- + drivers/net/ethernet/ti/cpsw_priv.h | 4 +- + drivers/net/ethernet/ti/icssg/icssg_ethtool.c | 4 +- + drivers/net/ethernet/wangxun/libwx/wx_hw.c | 2 - + drivers/net/ethernet/wangxun/libwx/wx_lib.c | 20 +- + drivers/net/ethernet/wangxun/libwx/wx_type.h | 1 - + drivers/net/ethernet/wangxun/txgbe/Makefile | 1 + + drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c | 269 +++++ + drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h | 7 + + drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 140 +-- + drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 59 +- + drivers/net/ethernet/wangxun/txgbe/txgbe_phy.h | 2 + + drivers/net/ethernet/wangxun/txgbe/txgbe_type.h | 17 + + drivers/net/pcs/pcs-lynx.c | 1 + + drivers/net/pcs/pcs-mtk-lynxi.c | 1 + + drivers/net/pcs/pcs-xpcs.c | 1 + + drivers/net/phy/at803x.c | 327 ++++++ + drivers/net/phy/marvell.c | 2 +- + drivers/net/phy/micrel.c | 61 +- + drivers/net/phy/phy-c45.c | 44 +- + drivers/net/phy/phy.c | 8 +- + drivers/net/phy/phy_device.c | 16 + + drivers/net/phy/phylink.c | 8 +- + drivers/net/tun.c | 7 +- + drivers/net/usb/ax88179_178a.c | 20 +- + drivers/net/usb/lan78xx.c | 4 +- + drivers/net/usb/r8152.c | 28 +- + drivers/net/wireless/broadcom/b43/b43.h | 16 + + drivers/net/wireless/broadcom/b43/dma.c | 4 +- + drivers/net/wireless/broadcom/b43/main.c | 16 +- + drivers/net/wireless/broadcom/b43/pio.c | 6 +- + .../broadcom/brcm80211/brcmfmac/bca/core.c | 30 +- + .../broadcom/brcm80211/brcmfmac/cfg80211.c | 64 +- + .../broadcom/brcm80211/brcmfmac/cfg80211.h | 2 + + .../wireless/broadcom/brcm80211/brcmfmac/common.c | 18 +- + .../wireless/broadcom/brcm80211/brcmfmac/core.c | 12 +- + .../wireless/broadcom/brcm80211/brcmfmac/core.h | 2 +- + .../broadcom/brcm80211/brcmfmac/cyw/core.c | 50 +- + .../wireless/broadcom/brcm80211/brcmfmac/feature.c | 11 +- + .../wireless/broadcom/brcm80211/brcmfmac/fweh.c | 154 ++- + .../wireless/broadcom/brcm80211/brcmfmac/fweh.h | 60 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwil.c | 116 +-- + .../wireless/broadcom/brcm80211/brcmfmac/fwil.h | 125 ++- + .../broadcom/brcm80211/brcmfmac/fwil_types.h | 2 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwvid.c | 13 +- + .../wireless/broadcom/brcm80211/brcmfmac/fwvid.h | 48 +- + .../broadcom/brcm80211/brcmfmac/wcc/core.c | 31 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_cmn.c | 3 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_int.h | 2 +- + .../broadcom/brcm80211/brcmsmac/phy/phy_n.c | 11 +- + drivers/net/wireless/intel/iwlegacy/common.c | 4 +- + drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +- + drivers/net/wireless/marvell/mwifiex/debugfs.c | 3 - + drivers/net/wireless/marvell/mwifiex/wmm.c | 2 +- + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 12 +- + drivers/net/wireless/microchip/wilc1000/hif.c | 40 +- + drivers/net/wireless/microchip/wilc1000/netdev.c | 12 +- + drivers/net/wireless/microchip/wilc1000/wlan.c | 35 +- + drivers/net/wireless/microchip/wilc1000/wlan.h | 6 + + drivers/net/wireless/ralink/rt2x00/rt2x00crypto.c | 5 +- + drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h | 20 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188e.c | 3 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188f.c | 2 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192f.c | 33 +- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8710b.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c | 1 + + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 409 ++++++-- + .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h | 15 + + drivers/net/wireless/realtek/rtlwifi/efuse.c | 36 +- + drivers/net/wireless/realtek/rtlwifi/efuse.h | 4 +- + drivers/net/wireless/realtek/rtlwifi/pci.c | 12 +- + .../net/wireless/realtek/rtlwifi/rtl8192ce/trx.c | 4 - + .../net/wireless/realtek/rtlwifi/rtl8192cu/sw.c | 6 +- + .../net/wireless/realtek/rtlwifi/rtl8192cu/trx.c | 3 - + .../net/wireless/realtek/rtlwifi/rtl8192de/trx.c | 5 +- + .../net/wireless/realtek/rtlwifi/rtl8723ae/trx.c | 6 +- + drivers/net/wireless/realtek/rtlwifi/usb.c | 164 +-- + drivers/net/wireless/realtek/rtlwifi/wifi.h | 38 +- + drivers/net/wireless/realtek/rtw88/debug.c | 44 +- + drivers/net/wireless/realtek/rtw88/pci.c | 4 + + drivers/net/wireless/realtek/rtw88/reg.h | 3 + + drivers/net/wireless/realtek/rtw89/cam.c | 61 ++ + drivers/net/wireless/realtek/rtw89/cam.h | 109 ++ + drivers/net/wireless/realtek/rtw89/chan.c | 2 +- + drivers/net/wireless/realtek/rtw89/core.c | 344 +++++-- + drivers/net/wireless/realtek/rtw89/core.h | 136 ++- + drivers/net/wireless/realtek/rtw89/fw.c | 944 ++++++++++++++++-- + drivers/net/wireless/realtek/rtw89/fw.h | 810 ++++++++------- + drivers/net/wireless/realtek/rtw89/mac.c | 96 +- + drivers/net/wireless/realtek/rtw89/mac.h | 5 +- + drivers/net/wireless/realtek/rtw89/mac80211.c | 18 +- + drivers/net/wireless/realtek/rtw89/mac_be.c | 4 +- + drivers/net/wireless/realtek/rtw89/pci.c | 69 +- + drivers/net/wireless/realtek/rtw89/pci.h | 1 + + drivers/net/wireless/realtek/rtw89/phy.c | 46 +- + drivers/net/wireless/realtek/rtw89/phy.h | 72 ++ + drivers/net/wireless/realtek/rtw89/phy_be.c | 312 ++++++ + drivers/net/wireless/realtek/rtw89/reg.h | 278 +++++- + drivers/net/wireless/realtek/rtw89/rtw8851b.c | 15 +- + .../net/wireless/realtek/rtw89/rtw8851b_table.c | 72 +- + drivers/net/wireless/realtek/rtw89/rtw8852a.c | 11 +- + drivers/net/wireless/realtek/rtw89/rtw8852b.c | 15 +- + .../net/wireless/realtek/rtw89/rtw8852b_table.c | 142 +-- + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 14 +- + drivers/net/wireless/realtek/rtw89/rtw8922a.c | 705 ++++++++++++- + drivers/net/wireless/realtek/rtw89/wow.c | 2 +- + drivers/ptp/Kconfig | 12 + + drivers/ptp/Makefile | 1 + + drivers/ptp/ptp_clock.c | 16 +- + drivers/ptp/ptp_fc3.c | 1016 +++++++++++++++++++ + drivers/ptp/ptp_fc3.h | 45 + + drivers/ptp/ptp_sysfs.c | 13 +- + include/linux/bpf.h | 142 ++- + include/linux/bpf_verifier.h | 3 +- + include/linux/btf.h | 13 + + include/linux/ethtool.h | 17 +- + include/linux/filter.h | 3 +- + include/linux/inet_diag.h | 1 + + include/linux/lsm_hook_defs.h | 15 +- + include/linux/mfd/idtRC38xxx_reg.h | 273 +++++ + include/linux/phy.h | 30 +- + include/linux/phylink.h | 4 +- + include/linux/ptp_clock_kernel.h | 3 + + include/linux/security.h | 43 +- + include/linux/sock_diag.h | 10 +- + include/linux/stmmac.h | 1 + + include/net/af_unix.h | 14 +- + include/net/dsa.h | 4 +- + include/net/ip6_fib.h | 6 - + include/net/netfilter/nf_tables.h | 6 + + include/net/request_sock.h | 39 + + include/net/scm.h | 1 + + include/net/sock.h | 25 - + include/net/tcp.h | 45 + + include/uapi/linux/bpf.h | 78 +- + include/uapi/linux/netfilter/nf_tables.h | 6 +- + include/uapi/linux/ptp_clock.h | 13 +- + kernel/bpf/Makefile | 2 +- + kernel/bpf/arraymap.c | 2 +- + kernel/bpf/bpf_lsm.c | 15 +- + kernel/bpf/bpf_struct_ops.c | 447 +++++---- + kernel/bpf/bpf_struct_ops_types.h | 12 - + kernel/bpf/btf.c | 276 ++++- + kernel/bpf/cgroup.c | 6 +- + kernel/bpf/core.c | 13 +- + kernel/bpf/helpers.c | 7 +- + kernel/bpf/inode.c | 276 ++++- + kernel/bpf/syscall.c | 234 +++-- + kernel/bpf/token.c | 278 ++++++ + kernel/bpf/verifier.c | 148 ++- + kernel/trace/bpf_trace.c | 17 +- + net/bpf/bpf_dummy_struct_ops.c | 22 +- + net/bridge/netfilter/Kconfig | 7 + + net/bridge/netfilter/Makefile | 2 +- + net/core/dev.c | 27 +- + net/core/dev.h | 1 + + net/core/filter.c | 155 ++- + net/core/scm.c | 5 + + net/core/sock.c | 14 +- + net/core/sock_diag.c | 120 ++- + net/core/xdp.c | 6 +- + net/dccp/diag.c | 1 + + net/dsa/user.c | 4 +- + net/ethtool/common.c | 5 + + net/ethtool/common.h | 1 + + net/ethtool/eee.c | 75 +- + net/ethtool/ioctl.c | 69 +- + net/ieee802154/6lowpan/core.c | 1 + + net/ieee802154/socket.c | 1 + + net/ipv4/bpf_tcp_ca.c | 22 +- + net/ipv4/inet_diag.c | 101 +- + net/ipv4/netfilter/Kconfig | 43 +- + net/ipv4/netfilter/Makefile | 2 +- + net/ipv4/raw_diag.c | 1 + + net/ipv4/syncookies.c | 40 +- + net/ipv4/tcp_diag.c | 1 + + net/ipv4/tcp_input.c | 18 +- + net/ipv4/udp_diag.c | 2 + + net/ipv6/ip6_fib.c | 19 +- + net/ipv6/netfilter/Kconfig | 20 +- + net/ipv6/netfilter/Makefile | 2 +- + net/ipv6/route.c | 8 +- + net/ipv6/syncookies.c | 13 +- + net/mptcp/mptcp_diag.c | 1 + + net/netfilter/Kconfig | 12 +- + net/netfilter/ipvs/ip_vs_conn.c | 4 +- + net/netfilter/nf_bpf_link.c | 2 +- + net/netfilter/nf_conncount.c | 8 +- + net/netfilter/nf_tables_api.c | 35 +- + net/netlink/diag.c | 1 + + net/packet/diag.c | 1 + + net/rds/connection.c | 4 +- + net/sched/sch_taprio.c | 72 +- + net/sctp/diag.c | 1 + + net/smc/smc_diag.c | 1 + + net/tipc/diag.c | 1 + + net/tipc/node.c | 2 - + net/tipc/socket.c | 1 - + net/unix/af_unix.c | 10 +- + net/unix/diag.c | 1 + + net/unix/garbage.c | 98 +- + net/unix/scm.c | 27 +- + net/vmw_vsock/diag.c | 1 + + net/xdp/xsk_diag.c | 1 + + rust/kernel/net/phy.rs | 24 +- + security/security.c | 101 +- + security/selinux/hooks.c | 47 +- + tools/bpf/bpftool/link.c | 96 +- + tools/bpf/bpftool/prog.c | 2 +- + tools/include/uapi/linux/bpf.h | 79 +- + tools/lib/bpf/Build | 2 +- + tools/lib/bpf/bpf.c | 42 +- + tools/lib/bpf/bpf.h | 38 +- + tools/lib/bpf/bpf_core_read.h | 2 +- + tools/lib/bpf/btf.c | 10 +- + tools/lib/bpf/elf.c | 2 - + tools/lib/bpf/features.c | 503 ++++++++++ + tools/lib/bpf/libbpf.c | 604 +++-------- + tools/lib/bpf/libbpf.h | 21 +- + tools/lib/bpf/libbpf.map | 1 + + tools/lib/bpf/libbpf_internal.h | 50 +- + tools/lib/bpf/libbpf_probes.c | 12 +- + tools/lib/bpf/str_error.h | 3 + + tools/testing/selftests/Makefile | 7 +- + tools/testing/selftests/bpf/README.rst | 32 +- + tools/testing/selftests/bpf/bpf_experimental.h | 21 +- + tools/testing/selftests/bpf/bpf_kfuncs.h | 10 + + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 75 ++ + .../selftests/bpf/bpf_testmod/bpf_testmod.h | 5 + + tools/testing/selftests/bpf/config | 1 + + .../selftests/bpf/prog_tests/bpf_verif_scale.c | 2 +- + .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 44 - + .../selftests/bpf/prog_tests/fill_link_info.c | 114 ++- + .../selftests/bpf/prog_tests/kptr_xchg_inline.c | 51 + + .../selftests/bpf/prog_tests/libbpf_probes.c | 4 + + .../testing/selftests/bpf/prog_tests/libbpf_str.c | 6 + + .../testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- + .../testing/selftests/bpf/prog_tests/tc_redirect.c | 90 +- + .../bpf/prog_tests/tcp_custom_syncookie.c | 150 +++ + .../bpf/prog_tests/test_struct_ops_module.c | 75 ++ + tools/testing/selftests/bpf/prog_tests/token.c | 1052 ++++++++++++++++++++ + tools/testing/selftests/bpf/prog_tests/xdpwall.c | 2 +- + tools/testing/selftests/bpf/progs/bpf_misc.h | 2 +- + .../testing/selftests/bpf/progs/bpf_tracing_net.h | 16 + + tools/testing/selftests/bpf/progs/iters.c | 4 +- + .../testing/selftests/bpf/progs/kptr_xchg_inline.c | 48 + + tools/testing/selftests/bpf/progs/priv_map.c | 13 + + tools/testing/selftests/bpf/progs/priv_prog.c | 13 + + .../selftests/bpf/progs/struct_ops_module.c | 30 + + .../selftests/bpf/progs/test_core_reloc_type_id.c | 2 +- + .../selftests/bpf/progs/test_fill_link_info.c | 6 + + .../testing/selftests/bpf/progs/test_map_in_map.c | 26 + + tools/testing/selftests/bpf/progs/test_siphash.h | 64 ++ + .../bpf/progs/test_tcp_custom_syncookie.c | 572 +++++++++++ + .../bpf/progs/test_tcp_custom_syncookie.h | 140 +++ + .../testing/selftests/bpf/progs/test_tcpbpf_kern.c | 2 +- + .../testing/selftests/bpf/progs/test_xdp_dynptr.c | 10 +- + tools/testing/selftests/bpf/progs/token_lsm.c | 32 + + .../bpf/progs/verifier_direct_packet_access.c | 2 +- + .../testing/selftests/bpf/progs/verifier_loops1.c | 24 + + .../selftests/bpf/progs/verifier_spill_fill.c | 229 ++++- + tools/testing/selftests/bpf/test_loader.c | 4 +- + tools/testing/selftests/bpf/test_maps.c | 6 +- + tools/testing/selftests/bpf/test_progs.c | 18 - + tools/testing/selftests/bpf/test_sock_addr.c | 3 +- + tools/testing/selftests/bpf/test_verifier.c | 60 +- + tools/testing/selftests/bpf/testing_helpers.c | 92 +- + tools/testing/selftests/bpf/testing_helpers.h | 8 + + .../selftests/bpf/verifier/bpf_loop_inline.c | 6 + + tools/testing/selftests/bpf/verifier/precise.c | 6 +- + .../testing/selftests/drivers/net/bonding/Makefile | 7 +- + .../drivers/net/bonding/bond-eth-type-change.sh | 2 +- + .../drivers/net/bonding/bond_topo_2d1c.sh | 2 +- + .../drivers/net/bonding/dev_addr_lists.sh | 2 +- + .../drivers/net/bonding/mode-1-recovery-updelay.sh | 2 +- + .../drivers/net/bonding/mode-2-recovery-updelay.sh | 2 +- + .../drivers/net/bonding/net_forwarding_lib.sh | 1 - + tools/testing/selftests/drivers/net/dsa/Makefile | 18 +- + .../drivers/net/dsa/bridge_locked_port.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_mdb.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_mld.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_vlan_aware.sh | 2 +- + .../selftests/drivers/net/dsa/bridge_vlan_mcast.sh | 2 +- + .../drivers/net/dsa/bridge_vlan_unaware.sh | 2 +- + tools/testing/selftests/drivers/net/dsa/lib.sh | 1 - + .../selftests/drivers/net/dsa/local_termination.sh | 2 +- + .../selftests/drivers/net/dsa/no_forwarding.sh | 2 +- + .../drivers/net/dsa/run_net_forwarding_test.sh | 9 + + .../selftests/drivers/net/dsa/tc_actions.sh | 2 +- + .../testing/selftests/drivers/net/dsa/tc_common.sh | 1 - + .../drivers/net/dsa/test_bridge_fdb_stress.sh | 2 +- + tools/testing/selftests/drivers/net/team/Makefile | 7 +- + .../selftests/drivers/net/team/dev_addr_lists.sh | 4 +- + .../testing/selftests/drivers/net/team/lag_lib.sh | 1 - + .../drivers/net/team/net_forwarding_lib.sh | 1 - + tools/testing/selftests/lib.mk | 19 + + tools/testing/selftests/net/fcnal-test.sh | 25 +- + tools/testing/selftests/net/forwarding/Makefile | 3 + + tools/testing/selftests/net/forwarding/config | 28 + + tools/testing/selftests/net/forwarding/lib.sh | 37 +- + .../selftests/net/forwarding/mirror_gre_lib.sh | 2 +- + .../net/forwarding/mirror_gre_topo_lib.sh | 2 +- + tools/testing/selftests/net/fq_band_pktlimit.sh | 14 +- + tools/testing/selftests/net/txtimestamp.sh | 12 +- + tools/testing/selftests/tc-testing/config | 1 + + .../selftests/tc-testing/tc-tests/qdiscs/fq.json | 2 +- + .../tc-testing/tc-tests/qdiscs/taprio.json | 2 + + tools/testing/selftests/tc-testing/tdc.py | 2 +- + tools/testing/selftests/tc-testing/tdc.sh | 3 +- + tools/testing/vsock/util.c | 17 +- + tools/testing/vsock/util.h | 4 + + tools/testing/vsock/vsock_diag_test.c | 23 +- + tools/testing/vsock/vsock_test.c | 102 +- + tools/testing/vsock/vsock_test_zerocopy.c | 12 +- + tools/testing/vsock/vsock_uring_test.c | 17 +- + 436 files changed, 16458 insertions(+), 4928 deletions(-) + create mode 100644 Documentation/devicetree/bindings/net/qca,qca808x.yaml + create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c + create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h + create mode 100644 drivers/ptp/ptp_fc3.c + create mode 100644 drivers/ptp/ptp_fc3.h + create mode 100644 include/linux/mfd/idtRC38xxx_reg.h + delete mode 100644 kernel/bpf/bpf_struct_ops_types.h + create mode 100644 kernel/bpf/token.c + create mode 100644 tools/lib/bpf/features.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c + create mode 100644 tools/testing/selftests/bpf/prog_tests/token.c + create mode 100644 tools/testing/selftests/bpf/progs/kptr_xchg_inline.c + create mode 100644 tools/testing/selftests/bpf/progs/priv_map.c + create mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c + create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_module.c + create mode 100644 tools/testing/selftests/bpf/progs/test_siphash.h + create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c + create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.h + create mode 100644 tools/testing/selftests/bpf/progs/token_lsm.c + delete mode 120000 tools/testing/selftests/drivers/net/bonding/net_forwarding_lib.sh + delete mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh + create mode 100755 tools/testing/selftests/drivers/net/dsa/run_net_forwarding_test.sh + delete mode 120000 tools/testing/selftests/drivers/net/dsa/tc_common.sh + delete mode 120000 tools/testing/selftests/drivers/net/team/lag_lib.sh + delete mode 120000 tools/testing/selftests/drivers/net/team/net_forwarding_lib.sh +Merging bpf-next/for-next (cd1c194ffe28 Merge branch 'annotate-kfuncs-in-btf_ids-section') +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git bpf-next/for-next +Auto-merging drivers/hid/bpf/hid_bpf_dispatch.c +Auto-merging net/core/xdp.c +Merge made by the 'ort' strategy. + Documentation/bpf/kfuncs.rst | 8 +- + .../bpf/standardization/instruction-set.rst | 13 +- + arch/riscv/net/bpf_jit.h | 134 +++++++++++++ + arch/riscv/net/bpf_jit_comp64.c | 210 +++++++-------------- + drivers/hid/bpf/hid_bpf_dispatch.c | 8 +- + fs/verity/measure.c | 4 +- + include/linux/bpf.h | 1 - + include/linux/bpf_verifier.h | 1 + + include/linux/btf_ids.h | 21 ++- + kernel/bpf/btf.c | 138 +++++++++++--- + kernel/bpf/cpumask.c | 4 +- + kernel/bpf/helpers.c | 8 +- + kernel/bpf/map_iter.c | 4 +- + kernel/bpf/token.c | 16 +- + kernel/bpf/verifier.c | 24 +++ + kernel/cgroup/rstat.c | 4 +- + kernel/events/core.c | 6 +- + kernel/trace/bpf_trace.c | 8 +- + net/bpf/test_run.c | 8 +- + net/core/filter.c | 20 +- + net/core/xdp.c | 4 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/fou_bpf.c | 4 +- + net/ipv4/tcp_bbr.c | 4 +- + net/ipv4/tcp_cubic.c | 4 +- + net/ipv4/tcp_dctcp.c | 4 +- + net/netfilter/nf_conntrack_bpf.c | 4 +- + net/netfilter/nf_nat_bpf.c | 4 +- + net/xfrm/xfrm_interface_bpf.c | 4 +- + net/xfrm/xfrm_state_bpf.c | 4 +- + scripts/bpf_doc.py | 2 +- + tools/bpf/bpftool/gen.c | 9 +- + tools/lib/bpf/bpf_core_read.h | 13 ++ + tools/lib/bpf/bpf_helpers.h | 2 + + tools/lib/bpf/btf.c | 22 ++- + tools/lib/bpf/features.c | 58 ++++++ + tools/lib/bpf/libbpf.c | 86 +++------ + tools/lib/bpf/libbpf_internal.h | 16 ++ + tools/testing/selftests/bpf/Makefile | 31 ++- + tools/testing/selftests/bpf/bpf_kfuncs.h | 2 +- + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 10 +- + .../selftests/bpf/prog_tests/decap_sanity.c | 2 +- + .../testing/selftests/bpf/prog_tests/fib_lookup.c | 2 +- + .../selftests/bpf/prog_tests/ip_check_defrag.c | 4 +- + .../selftests/bpf/prog_tests/lwt_redirect.c | 3 +- + .../testing/selftests/bpf/prog_tests/lwt_reroute.c | 2 +- + tools/testing/selftests/bpf/prog_tests/mptcp.c | 2 +- + .../selftests/bpf/prog_tests/sock_destroy.c | 2 +- + .../selftests/bpf/prog_tests/sock_iter_batch.c | 4 +- + .../testing/selftests/bpf/prog_tests/test_tunnel.c | 18 +- + tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + + .../selftests/bpf/progs/connect_unix_prog.c | 3 +- + .../selftests/bpf/progs/getpeername_unix_prog.c | 3 +- + .../selftests/bpf/progs/getsockname_unix_prog.c | 3 +- + .../selftests/bpf/progs/recvmsg_unix_prog.c | 3 +- + .../selftests/bpf/progs/sendmsg_unix_prog.c | 3 +- + .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 4 +- + .../testing/selftests/bpf/progs/sock_iter_batch.c | 4 +- + tools/testing/selftests/bpf/progs/type_cast.c | 13 +- + .../selftests/bpf/progs/verifier_global_ptr_args.c | 156 +++++++++++++++ + tools/testing/selftests/bpf/test_progs.h | 7 +- + 61 files changed, 791 insertions(+), 380 deletions(-) + create mode 100644 tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +Merging ipsec-next/master (ab1e1a38de24 xfrm6_tunnel: Use KMEM_CACHE instead of kmem_cache_create) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git ipsec-next/master +Auto-merging net/xfrm/xfrm_policy.c +Merge made by the 'ort' strategy. + net/ipv6/xfrm6_tunnel.c | 5 +- + net/xfrm/xfrm_policy.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 141 insertions(+), 6 deletions(-) +Merging mlx5-next/mlx5-next (d727d27db536 RDMA/mlx5: Expose register c0 for RDMA device) +$ git merge -m Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git mlx5-next/mlx5-next +Already up to date. +Merging netfilter-next/main (5264ab612e28 selftests/net: calibrate txtimestamp) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git netfilter-next/main +Already up to date. +Merging ipvs-next/main (7ad269787b66 netfilter: ebtables: allow xtables-nft only builds) +$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git ipvs-next/main +Already up to date. +Merging bluetooth/master (64692e12507b Bluetooth: qca: Fix wrong event type for patch config command) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git bluetooth/master +Auto-merging drivers/bluetooth/btnxpuart.c +Auto-merging drivers/bluetooth/btusb.c +Merge made by the 'ort' strategy. + drivers/bluetooth/btintel.c | 2 +- + drivers/bluetooth/btnxpuart.c | 24 ++++++- + drivers/bluetooth/btqca.c | 2 +- + drivers/bluetooth/btrtl.c | 14 ++++ + drivers/bluetooth/btusb.c | 8 +++ + drivers/bluetooth/hci_bcm4377.c | 3 +- + include/net/bluetooth/hci.h | 4 +- + include/net/bluetooth/hci_sync.h | 2 +- + net/bluetooth/hci_core.c | 135 ++++++++++++++++++++++++++++----------- + net/bluetooth/hci_event.c | 23 ++++--- + net/bluetooth/hci_request.c | 2 +- + net/bluetooth/hci_sock.c | 4 +- + net/bluetooth/hci_sync.c | 57 ++++++++++------- + net/bluetooth/l2cap_core.c | 8 ++- + net/bluetooth/mgmt.c | 36 +++++------ + net/bluetooth/rfcomm/core.c | 2 +- + 16 files changed, 226 insertions(+), 100 deletions(-) +Merging wireless-next/for-next (3fbf61207c66 Revert "mlx5 updates 2023-12-20") +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git wireless-next/for-next +Already up to date. +Merging wpan-next/master (2373699560a7 mac802154: Avoid new associations while disassociating) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-next/master +Already up to date. +Merging wpan-staging/staging (2373699560a7 mac802154: Avoid new associations while disassociating) +$ git merge -m Merge branch 'staging' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-staging/staging +Already up to date. +Merging mtd/mtd/next (98d4fda8f2d4 Merge tag 'nand/for-6.8' into mtd/next) +$ git merge -m Merge branch 'mtd/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd/mtd/next +Already up to date. +Merging nand/nand/next (023e6aad7e5e mtd: rawnand: s3c2410: fix Excess struct member description kernel-doc warnings) +$ git merge -m Merge branch 'nand/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/nand/next +Already up to date. +Merging spi-nor/spi-nor/next (3c0e1dfa703c MAINTAINERS: change my mail to the kernel.org one) +$ git merge -m Merge branch 'spi-nor/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git spi-nor/spi-nor/next +Already up to date. +Merging crypto/master (4d314d27130b dt-bindings: crypto: ice: Document SC7180 inline crypto engine) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git crypto/master +Merge made by the 'ort' strategy. + Documentation/ABI/testing/debugfs-hisi-hpre | 7 + + Documentation/ABI/testing/debugfs-hisi-sec | 7 + + Documentation/ABI/testing/debugfs-hisi-zip | 7 + + .../bindings/crypto/qcom,inline-crypto-engine.yaml | 1 + + .../devicetree/bindings/crypto/qcom-qce.yaml | 1 + + arch/arm64/crypto/Kconfig | 1 + + arch/arm64/crypto/aes-ce-ccm-core.S | 265 ++++++++------------- + arch/arm64/crypto/aes-ce-ccm-glue.c | 154 ++++++++---- + arch/arm64/crypto/aes-glue.c | 1 + + arch/powerpc/crypto/Kconfig | 20 ++ + arch/powerpc/crypto/Makefile | 20 +- + {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_cbc.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_ctr.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aes_xts.c | 0 + .../crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h | 0 + .../vmx => arch/powerpc/crypto}/aesp8-ppc.pl | 0 + .../crypto/vmx => arch/powerpc/crypto}/ghash.c | 0 + .../vmx => arch/powerpc/crypto}/ghashp8-ppc.pl | 0 + {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c | 0 + crypto/asymmetric_keys/verify_pefile.c | 4 +- + crypto/pcbc.c | 4 +- + crypto/testmgr.c | 8 - + drivers/crypto/Kconfig | 14 +- + drivers/crypto/Makefile | 2 +- + drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c | 2 +- + drivers/crypto/hisilicon/debugfs.c | 53 +++++ + drivers/crypto/hisilicon/hpre/hpre_main.c | 2 +- + drivers/crypto/hisilicon/sec2/sec_main.c | 2 +- + drivers/crypto/hisilicon/zip/zip_main.c | 2 +- + drivers/crypto/intel/iaa/iaa_crypto.h | 25 -- + drivers/crypto/intel/iaa/iaa_crypto_comp_fixed.c | 1 - + drivers/crypto/intel/iaa/iaa_crypto_main.c | 108 +-------- + drivers/crypto/intel/iaa/iaa_crypto_stats.c | 2 - + .../crypto/intel/qat/qat_common/adf_gen4_hw_data.c | 3 + + drivers/crypto/intel/qat/qat_common/adf_isr.c | 2 +- + .../crypto/virtio/virtio_crypto_akcipher_algs.c | 12 +- + drivers/crypto/vmx/.gitignore | 3 - + drivers/crypto/vmx/Kconfig | 14 -- + drivers/crypto/vmx/Makefile | 23 -- + drivers/crypto/vmx/ppc-xlate.pl | 231 ------------------ + include/crypto/public_key.h | 1 + + 42 files changed, 344 insertions(+), 658 deletions(-) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%) + rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%) + delete mode 100644 drivers/crypto/vmx/.gitignore + delete mode 100644 drivers/crypto/vmx/Kconfig + delete mode 100644 drivers/crypto/vmx/Makefile + delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl +Merging drm/drm-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git drm/drm-next +Already up to date. +Merging drm-ci/topic/drm-ci (ad6bfe1b66a5 drm: ci: docs: fix build warning - add missing escape) +$ git merge -m Merge branch 'topic/drm-ci' of git://git.freedesktop.org/git/drm/drm.git drm-ci/topic/drm-ci +Already up to date. +Merging drm-exynos/for-linux-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git drm-exynos/for-linux-next +Already up to date. +Merging drm-misc/for-linux-next (1f1626ac0428 drm/ttm: fix ttm pool initialization for no-dma-device drivers) +$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm/drm-misc drm-misc/for-linux-next +Already up to date. +Merging amdgpu/drm-next (9217b91c6458 drm/amdgpu: Reset IH OVERFLOW_CLEAR bit) +$ git merge -m Merge branch 'drm-next' of https://gitlab.freedesktop.org/agd5f/linux amdgpu/drm-next +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +Auto-merging drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +Auto-merging drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +Auto-merging drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +Auto-merging drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +Auto-merging drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +Auto-merging drivers/gpu/drm/amd/display/dc/core/dc.c +Auto-merging drivers/gpu/drm/amd/display/dc/core/dc_resource.c +Auto-merging drivers/gpu/drm/amd/display/dc/dc.h +CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/display/dc/dc.h +Auto-merging drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +Auto-merging drivers/gpu/drm/amd/display/dc/link/link_dpms.c +Auto-merging drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +Auto-merging drivers/gpu/drm/amd/include/amd_shared.h +Auto-merging drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +Resolved 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c' using previous resolution. +Resolved 'drivers/gpu/drm/amd/display/dc/dc.h' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master ea89d33405ba] Merge branch 'drm-next' of https://gitlab.freedesktop.org/agd5f/linux +$ git diff -M --stat --summary HEAD^.. + Documentation/gpu/amdgpu/dgpu-asic-info-table.csv | 2 + + Documentation/gpu/amdgpu/display/dcn-blocks.rst | 78 ++ + .../gpu/amdgpu/display/display-contributing.rst | 168 ++++ + .../gpu/amdgpu/display/display-manager.rst | 3 - + Documentation/gpu/amdgpu/display/index.rst | 78 +- + drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 879 +++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 202 +++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 37 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 16 +- + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 59 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 24 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 8 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 35 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 55 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 4 + + drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 7 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 17 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 33 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 186 +++-- + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 9 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 689 ++++++++++++---- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 60 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 12 + + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 70 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h | 9 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 155 +++- + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 10 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umr.h | 4 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 83 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 69 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 18 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 81 +- + drivers/gpu/drm/amd/amdgpu/atom.c | 41 +- + drivers/gpu/drm/amd/amdgpu/atom.h | 2 +- + drivers/gpu/drm/amd/amdgpu/atombios_crtc.c | 28 +- + drivers/gpu/drm/amd/amdgpu/atombios_dp.c | 4 +- + drivers/gpu/drm/amd/amdgpu/atombios_encoders.c | 16 +- + drivers/gpu/drm/amd/amdgpu/atombios_i2c.c | 4 +- + drivers/gpu/drm/amd/amdgpu/cik_ih.c | 6 + + drivers/gpu/drm/amd/amdgpu/clearstate_gfx9.h | 27 +- + drivers/gpu/drm/amd/amdgpu/clearstate_si.h | 24 +- + drivers/gpu/drm/amd/amdgpu/cz_ih.c | 5 + + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 +- + drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 38 +- + drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +- + drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 5 +- + drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 92 ++- + drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 9 +- + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 16 +- + drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 5 + + drivers/gpu/drm/amd/amdgpu/ih_v6_0.c | 6 + + drivers/gpu/drm/amd/amdgpu/ih_v6_1.c | 7 + + drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c | 10 +- + drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 87 ++ + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 3 +- + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 29 +- + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 + + drivers/gpu/drm/amd/amdgpu/navi10_ih.c | 9 +- + drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 99 +-- + drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 15 +- + drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 72 ++ + drivers/gpu/drm/amd/amdgpu/si_ih.c | 6 + + drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 36 + + drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 6 + + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 252 ++++-- + drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 3 + + drivers/gpu/drm/amd/amdgpu/umc_v6_0.c | 2 +- + drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 17 - + drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 19 - + drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 6 + + drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 6 + + drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 14 +- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 2 +- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 2 +- + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 +- + drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 4 +- + drivers/gpu/drm/amd/amdkfd/kfd_events.c | 6 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 +- + drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 9 +- + drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +- + drivers/gpu/drm/amd/display/TODO | 110 --- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 65 +- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 1 + + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 72 +- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 55 +- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.c | 119 +-- + .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.h | 4 +- + drivers/gpu/drm/amd/display/dc/basics/conversion.c | 34 + + drivers/gpu/drm/amd/display/dc/basics/conversion.h | 4 + + .../gpu/drm/amd/display/dc/bios/command_table.c | 2 +- + .../gpu/drm/amd/display/dc/bios/command_table2.c | 2 +- + .../amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 40 +- + .../amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 72 +- + .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_smu.c | 15 + + drivers/gpu/drm/amd/display/dc/core/dc.c | 45 +- + drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 14 + + drivers/gpu/drm/amd/display/dc/dc.h | 5 +- + drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 14 +- + drivers/gpu/drm/amd/display/dc/dc_hw_types.h | 1 + + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 - + drivers/gpu/drm/amd/display/dc/dce/dce_audio.c | 299 ++++++- + drivers/gpu/drm/amd/display/dc/dce/dce_audio.h | 3 +- + drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c | 4 +- + .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c | 20 + + .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.h | 4 +- + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c | 3 +- + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h | 3 + + .../gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c | 70 +- + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.c | 31 +- + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.h | 3 + + .../gpu/drm/amd/display/dc/dcn20/dcn20_dpp_cm.c | 55 ++ + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c | 24 +- + drivers/gpu/drm/amd/display/dc/dcn201/dcn201_dpp.c | 1 + + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c | 38 +- + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.h | 2 + + .../gpu/drm/amd/display/dc/dcn30/dcn30_dpp_cm.c | 54 ++ + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c | 106 ++- + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.h | 4 + + .../amd/display/dc/dcn32/dcn32_dio_link_encoder.c | 4 +- + drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dpp.c | 1 + + .../amd/display/dc/dcn32/dcn32_resource_helpers.c | 14 - + .../amd/display/dc/dcn35/dcn35_dio_link_encoder.c | 4 +- + .../amd/display/dc/dml/dcn30/display_mode_vba_30.c | 16 +- + .../gpu/drm/amd/display/dc/dml/dcn303/dcn303_fpu.c | 11 + + .../gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 15 +- + .../gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 8 +- + .../amd/display/dc/dml2/dml2_dc_resource_mgmt.c | 41 +- + .../amd/display/dc/dml2/dml2_translation_helper.c | 31 +- + drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c | 5 + + .../drm/amd/display/dc/hwss/dce110/dce110_hwseq.c | 58 +- + .../drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 99 ++- + .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 126 ++- + .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h | 2 + + .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c | 167 +++- + .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.h | 6 +- + .../gpu/drm/amd/display/dc/hwss/dcn30/dcn30_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.c | 18 + + .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.h | 4 + + .../gpu/drm/amd/display/dc/hwss/dcn31/dcn31_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn314/dcn314_init.c | 2 +- + .../gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c | 2 +- + .../gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c | 2 +- + .../drm/amd/display/dc/hwss/dcn351/dcn351_init.c | 2 +- + drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h | 2 + + drivers/gpu/drm/amd/display/dc/inc/core_types.h | 2 + + drivers/gpu/drm/amd/display/dc/inc/hw/audio.h | 3 +- + .../drm/amd/display/dc/inc/hw/clk_mgr_internal.h | 6 + + drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h | 6 + + drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h | 39 + + drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h | 13 +- + drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h | 253 ++++-- + drivers/gpu/drm/amd/display/dc/inc/hw/opp.h | 16 + + drivers/gpu/drm/amd/display/dc/inc/resource.h | 3 + + .../drm/amd/display/dc/link/hwss/link_hwss_dio.h | 10 + + .../gpu/drm/amd/display/dc/link/link_detection.c | 18 + + drivers/gpu/drm/amd/display/dc/link/link_dpms.c | 58 ++ + .../display/dc/link/protocols/link_dp_dpia_bw.c | 2 +- + .../amd/display/dc/resource/dcn30/dcn30_resource.c | 11 + + .../amd/display/dc/resource/dcn32/dcn32_resource.c | 2 +- + .../amd/display/dc/resource/dcn32/dcn32_resource.h | 3 - + .../display/dc/resource/dcn321/dcn321_resource.c | 3 +- + .../amd/display/dc/resource/dcn35/dcn35_resource.c | 3 +- + drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 15 +- + drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 19 +- + drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 121 ++- + drivers/gpu/drm/amd/display/include/audio_types.h | 15 + + drivers/gpu/drm/amd/include/amd_shared.h | 1 + + drivers/gpu/drm/amd/include/arct_ip_offset.h | 6 +- + .../amd/include/asic_reg/dcn/dcn_3_1_6_offset.h | 4 + + .../amd/include/asic_reg/dcn/dcn_3_1_6_sh_mask.h | 10 + + .../amd/include/asic_reg/dcn/dcn_3_5_0_offset.h | 24 + + .../amd/include/asic_reg/dcn/dcn_3_5_0_sh_mask.h | 65 ++ + drivers/gpu/drm/amd/include/atom-bits.h | 2 +- + drivers/gpu/drm/amd/include/beige_goby_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/cgs_common.h | 23 +- + .../gpu/drm/amd/include/cyan_skillfish_ip_offset.h | 6 +- + .../drm/amd/include/dimgrey_cavefish_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/dm_pp_interface.h | 9 +- + drivers/gpu/drm/amd/include/kgd_pp_interface.h | 6 +- + drivers/gpu/drm/amd/include/navi12_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/navi14_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/pptable.h | 6 +- + drivers/gpu/drm/amd/include/renoir_ip_offset.h | 6 +- + .../gpu/drm/amd/include/sienna_cichlid_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/v10_structs.h | 3 +- + drivers/gpu/drm/amd/include/vangogh_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/vega10_ip_offset.h | 6 +- + drivers/gpu/drm/amd/include/vega20_ip_offset.h | 78 +- + .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c | 42 +- + .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomfwctrl.c | 4 +- + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 33 +- + drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 1 - + drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 16 +- + drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 2 +- + drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 18 +- + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 8 +- + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 158 +++- + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 8 +- + drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 2 +- + drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 9 +- + drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 10 + + drivers/gpu/drm/radeon/atom-bits.h | 2 +- + drivers/gpu/drm/radeon/atom.c | 47 +- + drivers/gpu/drm/radeon/atom.h | 4 +- + drivers/gpu/drm/radeon/atombios_crtc.c | 28 +- + drivers/gpu/drm/radeon/atombios_dp.c | 4 +- + drivers/gpu/drm/radeon/atombios_encoders.c | 38 +- + drivers/gpu/drm/radeon/atombios_i2c.c | 2 +- + drivers/gpu/drm/radeon/btc_dpm.c | 90 +-- + drivers/gpu/drm/radeon/ci_dpm.c | 31 +- + drivers/gpu/drm/radeon/ci_dpm.h | 6 +- + drivers/gpu/drm/radeon/clearstate_cayman.h | 9 +- + drivers/gpu/drm/radeon/clearstate_ci.h | 3 +- + drivers/gpu/drm/radeon/evergreen.c | 20 +- + drivers/gpu/drm/radeon/evergreen_cs.c | 4 +- + drivers/gpu/drm/radeon/evergreen_reg.h | 10 +- + drivers/gpu/drm/radeon/evergreen_smc.h | 9 +- + drivers/gpu/drm/radeon/kv_dpm.c | 9 +- + drivers/gpu/drm/radeon/kv_smc.c | 2 +- + drivers/gpu/drm/radeon/ni.c | 31 +- + drivers/gpu/drm/radeon/ni_dpm.c | 3 - + drivers/gpu/drm/radeon/ni_dpm.h | 12 +- + drivers/gpu/drm/radeon/nislands_smc.h | 51 +- + drivers/gpu/drm/radeon/r100.c | 2 +- + drivers/gpu/drm/radeon/r300_reg.h | 2 +- + drivers/gpu/drm/radeon/r600.c | 3 +- + drivers/gpu/drm/radeon/r600_dpm.c | 6 +- + drivers/gpu/drm/radeon/r600_dpm.h | 3 +- + drivers/gpu/drm/radeon/radeon.h | 6 +- + drivers/gpu/drm/radeon/radeon_asic.c | 8 +- + drivers/gpu/drm/radeon/radeon_atombios.c | 44 +- + drivers/gpu/drm/radeon/radeon_atpx_handler.c | 12 +- + drivers/gpu/drm/radeon/radeon_audio.c | 11 +- + drivers/gpu/drm/radeon/radeon_audio.h | 6 +- + drivers/gpu/drm/radeon/radeon_mode.h | 9 +- + drivers/gpu/drm/radeon/radeon_pm.c | 4 +- + drivers/gpu/drm/radeon/rs400.c | 4 +- + drivers/gpu/drm/radeon/rs600.c | 3 +- + drivers/gpu/drm/radeon/rv515.c | 3 +- + drivers/gpu/drm/radeon/rv6xx_dpm.h | 3 +- + drivers/gpu/drm/radeon/rv770_dpm.c | 4 +- + drivers/gpu/drm/radeon/rv770_smc.h | 27 +- + drivers/gpu/drm/radeon/si.c | 63 +- + drivers/gpu/drm/radeon/si_dpm.c | 132 ++-- + drivers/gpu/drm/radeon/si_dpm.h | 21 +- + drivers/gpu/drm/radeon/smu7.h | 6 +- + drivers/gpu/drm/radeon/smu7_discrete.h | 51 +- + drivers/gpu/drm/radeon/smu7_fusion.h | 42 +- + drivers/gpu/drm/radeon/sumo_dpm.c | 18 +- + drivers/gpu/drm/radeon/trinity_dpm.c | 22 +- + drivers/gpu/drm/radeon/trinity_dpm.h | 3 +- + drivers/gpu/drm/radeon/uvd_v1_0.c | 2 +- + include/uapi/drm/amdgpu_drm.h | 2 + + include/uapi/linux/kfd_ioctl.h | 3 +- + 285 files changed, 6426 insertions(+), 2236 deletions(-) + create mode 100644 Documentation/gpu/amdgpu/display/dcn-blocks.rst + create mode 100644 Documentation/gpu/amdgpu/display/display-contributing.rst + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c + create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h + delete mode 100644 drivers/gpu/drm/amd/display/TODO +Merging drm-intel/for-linux-next (fe4c6ff50c68 drm/i915/xe2lpd: Move registers to PICA) +$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm-intel drm-intel/for-linux-next +Auto-merging drivers/gpu/drm/i915/display/intel_backlight.c +Auto-merging drivers/gpu/drm/i915/display/intel_dp.c +Auto-merging drivers/gpu/drm/i915/display/intel_gmbus.c +Auto-merging drivers/gpu/drm/i915/display/intel_psr.c +Auto-merging drivers/gpu/drm/i915/display/intel_sdvo.c +Merge made by the 'ort' strategy. + drivers/gpu/drm/i915/display/intel_atomic_plane.c | 6 +- + drivers/gpu/drm/i915/display/intel_backlight.c | 2 +- + drivers/gpu/drm/i915/display/intel_bios.c | 36 +- + drivers/gpu/drm/i915/display/intel_bios.h | 5 +- + drivers/gpu/drm/i915/display/intel_cdclk.c | 361 ++++++++++----------- + drivers/gpu/drm/i915/display/intel_crt.c | 5 + + drivers/gpu/drm/i915/display/intel_crtc.c | 128 +------- + drivers/gpu/drm/i915/display/intel_cursor.c | 63 +++- + drivers/gpu/drm/i915/display/intel_cx0_phy.c | 231 ++++++------- + drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h | 63 +++- + drivers/gpu/drm/i915/display/intel_ddi.c | 67 ++-- + drivers/gpu/drm/i915/display/intel_display.c | 29 +- + drivers/gpu/drm/i915/display/intel_display_core.h | 16 +- + .../gpu/drm/i915/display/intel_display_debugfs.c | 26 +- + .../gpu/drm/i915/display/intel_display_device.c | 2 +- + .../gpu/drm/i915/display/intel_display_driver.c | 149 ++++++++- + .../gpu/drm/i915/display/intel_display_driver.h | 6 + + drivers/gpu/drm/i915/display/intel_display_irq.c | 10 +- + drivers/gpu/drm/i915/display/intel_display_types.h | 25 +- + drivers/gpu/drm/i915/display/intel_dmc.c | 2 +- + drivers/gpu/drm/i915/display/intel_dp.c | 192 ++++++----- + drivers/gpu/drm/i915/display/intel_dp.h | 10 +- + drivers/gpu/drm/i915/display/intel_dp_aux.c | 29 +- + drivers/gpu/drm/i915/display/intel_dp_mst.c | 4 + + drivers/gpu/drm/i915/display/intel_dpll_mgr.c | 83 +++-- + drivers/gpu/drm/i915/display/intel_dpll_mgr.h | 18 +- + drivers/gpu/drm/i915/display/intel_dsb.c | 4 + + drivers/gpu/drm/i915/display/intel_dvo.c | 5 + + drivers/gpu/drm/i915/display/intel_gmbus.c | 5 +- + drivers/gpu/drm/i915/display/intel_hdcp.c | 78 +++-- + drivers/gpu/drm/i915/display/intel_hdcp_regs.h | 28 +- + drivers/gpu/drm/i915/display/intel_hdmi.c | 16 +- + drivers/gpu/drm/i915/display/intel_hotplug.c | 165 ++++++++-- + drivers/gpu/drm/i915/display/intel_hotplug.h | 4 + + drivers/gpu/drm/i915/display/intel_hotplug_irq.c | 6 +- + drivers/gpu/drm/i915/display/intel_opregion.c | 176 +++++++--- + drivers/gpu/drm/i915/display/intel_opregion.h | 47 ++- + drivers/gpu/drm/i915/display/intel_panel.c | 4 + + drivers/gpu/drm/i915/display/intel_pps.c | 2 +- + drivers/gpu/drm/i915/display/intel_psr.c | 130 ++++++-- + drivers/gpu/drm/i915/display/intel_psr.h | 6 - + drivers/gpu/drm/i915/display/intel_psr_regs.h | 6 + + drivers/gpu/drm/i915/display/intel_sdvo.c | 6 + + drivers/gpu/drm/i915/display/intel_tc.c | 40 +-- + drivers/gpu/drm/i915/display/intel_tc.h | 2 +- + drivers/gpu/drm/i915/display/intel_tv.c | 7 +- + drivers/gpu/drm/i915/display/intel_vblank.c | 130 ++++++++ + drivers/gpu/drm/i915/display/intel_vblank.h | 12 + + drivers/gpu/drm/i915/display/skl_watermark.c | 16 +- + drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 5 +- + drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- + drivers/gpu/drm/i915/i915_driver.c | 25 +- + drivers/gpu/drm/i915/i915_reg.h | 8 + + drivers/gpu/drm/i915/soc/intel_pch.c | 16 +- + drivers/gpu/drm/i915/soc/intel_pch.h | 6 +- + include/drm/display/drm_dp.h | 1 + + include/drm/drm_print.h | 3 + + include/drm/i915_pciids.h | 3 + + 58 files changed, 1613 insertions(+), 919 deletions(-) +Merging drm-tegra/for-next (2429b3c529da drm/tegra: Avoid potential 32-bit integer overflow) +$ git merge -m Merge branch 'for-next' of https://gitlab.freedesktop.org/drm/tegra.git drm-tegra/for-next +Already up to date. +Merging drm-msm/msm-next (d4ca26ac4be0 drm/msm/dp: call dp_display_get_next_bridge() during probe) +$ git merge -m Merge branch 'msm-next' of https://gitlab.freedesktop.org/drm/msm.git drm-msm/msm-next +Already up to date. +Merging drm-msm-lumag/msm-next-lumag (d4ca26ac4be0 drm/msm/dp: call dp_display_get_next_bridge() during probe) +$ git merge -m Merge branch 'msm-next-lumag' of https://gitlab.freedesktop.org/lumag/msm.git drm-msm-lumag/msm-next-lumag +Already up to date. +Merging etnaviv/etnaviv/next (c9959996a8fc drm/etnaviv: add sensitive state for PE_RT_ADDR_4_PIPE(3, 0|1) address) +$ git merge -m Merge branch 'etnaviv/next' of https://git.pengutronix.de/git/lst/linux etnaviv/etnaviv/next +Auto-merging drivers/gpu/drm/etnaviv/etnaviv_drv.c +Auto-merging drivers/gpu/drm/etnaviv/etnaviv_gpu.c +Merge made by the 'ort' strategy. + drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c | 1 + + drivers/gpu/drm/etnaviv/etnaviv_drv.c | 93 ++++++++++++++++++---------- + drivers/gpu/drm/etnaviv/etnaviv_gem.c | 12 ++-- + drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 33 +++++++++- + drivers/gpu/drm/etnaviv/etnaviv_gpu.h | 12 ++++ + drivers/gpu/drm/etnaviv/etnaviv_hwdb.c | 34 ++++++++++ + drivers/gpu/drm/etnaviv/etnaviv_mmu.c | 4 +- + drivers/gpu/drm/etnaviv/etnaviv_perfmon.c | 4 +- + include/uapi/drm/etnaviv_drm.h | 5 ++ + 9 files changed, 154 insertions(+), 44 deletions(-) +Merging fbdev/for-next (72fee6b0a3a4 fbdev: Restrict FB_SH_MOBILE_LCDC to SuperH) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git fbdev/for-next +Merge made by the 'ort' strategy. + drivers/video/fbdev/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging regmap/for-next (a1214cdfe92b Merge branch 'regmap-linus' into regmap-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap/for-next +Merge made by the 'ort' strategy. +Merging sound/for-next (8b87a7863fa5 Merge branch 'topic/format-kunit' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound/for-next +Auto-merging MAINTAINERS +Auto-merging sound/pci/hda/patch_realtek.c +Merge made by the 'ort' strategy. + MAINTAINERS | 7 + + include/sound/emux_synth.h | 2 +- + sound/core/Kconfig | 16 ++ + sound/core/Makefile | 2 + + sound/core/pcm.c | 6 +- + sound/core/sound_kunit.c | 310 +++++++++++++++++++++++++++++++++ + sound/firewire/Kconfig | 2 + + sound/firewire/motu/motu-protocol-v3.c | 9 + + sound/firewire/motu/motu.c | 2 + + sound/firewire/motu/motu.h | 1 + + sound/pci/hda/Kconfig | 4 + + sound/pci/hda/Makefile | 2 + + sound/pci/hda/cs35l41_hda_property.c | 90 ++++++++-- + sound/pci/hda/hda_component.c | 169 ++++++++++++++++++ + sound/pci/hda/hda_component.h | 59 +++++++ + sound/pci/hda/patch_realtek.c | 271 +++++++++------------------- + sound/synth/emux/emux.c | 4 +- + 17 files changed, 744 insertions(+), 212 deletions(-) + create mode 100644 sound/core/sound_kunit.c + create mode 100644 sound/pci/hda/hda_component.c +Merging ieee1394/for-next (dd754748f1be firewire: Convert snprintf/sprintf to sysfs_emit) +$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git ieee1394/for-next +Merge made by the 'ort' strategy. + drivers/firewire/core-device.c | 16 ++++------------ + 1 file changed, 4 insertions(+), 12 deletions(-) +Merging sound-asoc/for-next (e3468b7aab5c Merge remote-tracking branch 'asoc/for-6.9' into asoc-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc/for-next +Merge made by the 'ort' strategy. + .../bindings/sound/audio-graph-port.yaml | 2 +- + .../devicetree/bindings/sound/fsl,easrc.yaml | 4 +- + .../devicetree/bindings/sound/fsl,micfil.yaml | 14 +- + .../devicetree/bindings/sound/fsl,sai.yaml | 6 + + .../bindings/sound/infineon,peb2466.yaml | 2 +- + .../devicetree/bindings/sound/qcom,wcd938x.yaml | 81 +- + .../bindings/sound/qcom,wcd939x-sdw.yaml | 69 + + .../devicetree/bindings/sound/qcom,wcd939x.yaml | 96 + + .../bindings/sound/qcom,wcd93xx-common.yaml | 95 + + .../devicetree/bindings/sound/samsung,tm2.yaml | 7 +- + drivers/soundwire/Makefile | 2 +- + drivers/soundwire/amd_init.c | 235 ++ + drivers/soundwire/amd_init.h | 13 + + drivers/soundwire/amd_manager.c | 47 +- + drivers/soundwire/amd_manager.h | 16 +- + include/linux/soundwire/sdw_amd.h | 83 +- + include/sound/sof/dai-amd.h | 7 + + include/sound/sof/dai.h | 2 + + include/uapi/sound/sof/tokens.h | 4 + + sound/soc/amd/acp/Kconfig | 7 + + sound/soc/amd/acp/Makefile | 2 + + sound/soc/amd/acp/acp-mach-common.c | 6 +- + sound/soc/amd/acp/acp-sof-mach.c | 26 +- + sound/soc/amd/acp/amd-sdw-acpi.c | 62 + + sound/soc/atmel/mikroe-proto.c | 8 +- + sound/soc/codecs/Kconfig | 34 + + sound/soc/codecs/Makefile | 9 + + sound/soc/codecs/cs42l43-jack.c | 27 +- + sound/soc/codecs/cs42l43-sdw.c | 1 + + sound/soc/codecs/cs42l43.c | 64 +- + sound/soc/codecs/cs42l43.h | 25 +- + sound/soc/codecs/es8326.c | 92 +- + sound/soc/codecs/es8326.h | 5 +- + sound/soc/codecs/framer-codec.c | 413 +++ + sound/soc/codecs/nau8540.c | 112 +- + sound/soc/codecs/nau8540.h | 13 +- + sound/soc/codecs/wcd-clsh-v2.h | 1 + + sound/soc/codecs/wcd-mbhc-v2.c | 95 +- + sound/soc/codecs/wcd-mbhc-v2.h | 3 + + sound/soc/codecs/wcd939x-sdw.c | 1551 ++++++++ + sound/soc/codecs/wcd939x.c | 3686 ++++++++++++++++++++ + sound/soc/codecs/wcd939x.h | 989 ++++++ + sound/soc/fsl/eukrea-tlv320.c | 8 +- + sound/soc/fsl/fsl_sai.c | 13 + + sound/soc/fsl/p1022_rdk.c | 33 +- + sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 57 + + sound/soc/qcom/common.c | 2 +- + sound/soc/sh/rz-ssi.c | 2 +- + sound/soc/sof/amd/Kconfig | 18 + + sound/soc/sof/amd/acp-common.c | 65 +- + sound/soc/sof/amd/acp-dsp-offset.h | 10 + + sound/soc/sof/amd/acp-loader.c | 34 +- + sound/soc/sof/amd/acp.c | 232 +- + sound/soc/sof/amd/acp.h | 26 +- + sound/soc/sof/amd/pci-acp63.c | 7 + + sound/soc/sof/fw-file-profile.c | 18 +- + sound/soc/sof/ipc3-pcm.c | 25 + + sound/soc/sof/ipc3-topology.c | 40 + + sound/soc/sof/sof-audio.h | 1 + + sound/soc/sof/topology.c | 5 + + sound/soc/ti/j721e-evm.c | 4 +- + sound/soc/ti/omap-hdmi.c | 10 +- + 62 files changed, 8283 insertions(+), 343 deletions(-) + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x-sdw.yaml + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml + create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd93xx-common.yaml + create mode 100644 drivers/soundwire/amd_init.c + create mode 100644 drivers/soundwire/amd_init.h + create mode 100644 sound/soc/amd/acp/amd-sdw-acpi.c + create mode 100644 sound/soc/codecs/framer-codec.c + create mode 100644 sound/soc/codecs/wcd939x-sdw.c + create mode 100644 sound/soc/codecs/wcd939x.c + create mode 100644 sound/soc/codecs/wcd939x.h +Merging modules/modules-next (3559ad395bf0 module: Change module_enable_{nx/x/ro}() to more explicit names) +$ git merge -m Merge branch 'modules-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules/modules-next +Auto-merging kernel/module/main.c +Merge made by the 'ort' strategy. + kernel/module/internal.h | 6 +++--- + kernel/module/main.c | 8 ++++---- + kernel/module/strict_rwx.c | 16 +++++++++------- + 3 files changed, 16 insertions(+), 14 deletions(-) +Merging input/next (7d0f351da460 Input: matrix_keypad - switch to using managed resources) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input/next +Merge made by the 'ort' strategy. + .../bindings/input/touchscreen/goodix,gt9916.yaml | 95 +++ + .../bindings/input/touchscreen/goodix.yaml | 5 +- + .../bindings/input/touchscreen/melfas,mms114.yaml | 6 +- + .../bindings/input/touchscreen/silead,gsl1680.yaml | 2 +- + drivers/input/input-leds.c | 8 +- + drivers/input/input.c | 14 +- + drivers/input/keyboard/bcm-keypad.c | 2 +- + drivers/input/keyboard/matrix_keypad.c | 170 ++--- + drivers/input/misc/88pm80x_onkey.c | 14 +- + drivers/input/mouse/Kconfig | 12 - + drivers/input/mouse/Makefile | 1 - + drivers/input/mouse/navpoint.c | 350 ---------- + drivers/input/rmi4/rmi_driver.c | 6 +- + drivers/input/touchscreen/Kconfig | 31 + + drivers/input/touchscreen/Makefile | 3 + + drivers/input/touchscreen/goodix_berlin.h | 24 + + drivers/input/touchscreen/goodix_berlin_core.c | 755 +++++++++++++++++++++ + drivers/input/touchscreen/goodix_berlin_i2c.c | 75 ++ + drivers/input/touchscreen/goodix_berlin_spi.c | 178 +++++ + include/linux/input/navpoint.h | 8 - + 20 files changed, 1244 insertions(+), 515 deletions(-) + create mode 100644 Documentation/devicetree/bindings/input/touchscreen/goodix,gt9916.yaml + delete mode 100644 drivers/input/mouse/navpoint.c + create mode 100644 drivers/input/touchscreen/goodix_berlin.h + create mode 100644 drivers/input/touchscreen/goodix_berlin_core.c + create mode 100644 drivers/input/touchscreen/goodix_berlin_i2c.c + create mode 100644 drivers/input/touchscreen/goodix_berlin_spi.c + delete mode 100644 include/linux/input/navpoint.h +Merging block/for-next (b48b5a7c9bc1 Merge branch 'block-deadline' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.dk/linux-block.git block/for-next +Auto-merging include/linux/sched.h +Merge made by the 'ort' strategy. + block/bfq-cgroup.c | 14 ++--- + block/bfq-iosched.c | 148 +++++++++++++++++++++++++++++++++++----------- + block/bfq-iosched.h | 16 ++++- + block/blk-cgroup.c | 2 +- + block/blk-cgroup.h | 1 + + block/blk-core.c | 3 + + block/blk-flush.c | 2 +- + block/blk-iocost.c | 8 +-- + block/blk-iolatency.c | 6 +- + block/blk-mq.c | 16 ++--- + block/blk-throttle.c | 6 +- + block/blk-wbt.c | 6 +- + block/blk.h | 67 +++++++++++++++++++++ + block/mq-deadline.c | 114 ++++++++++++++++++++++++++++------- + include/linux/blk_types.h | 42 ------------- + include/linux/blkdev.h | 17 ++++++ + include/linux/sched.h | 2 +- + kernel/sched/core.c | 6 +- + 18 files changed, 342 insertions(+), 134 deletions(-) +Merging device-mapper/for-next (4eacc39d5529 dm-crypt, dm-verity: disable tasklets) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git device-mapper/for-next +Merge made by the 'ort' strategy. + drivers/md/dm-core.h | 2 ++ + drivers/md/dm-crypt.c | 38 ++------------------------------------ + drivers/md/dm-ioctl.c | 3 ++- + drivers/md/dm-stats.c | 9 +++++++++ + drivers/md/dm-table.c | 9 +++++++-- + drivers/md/dm-verity-target.c | 27 ++------------------------- + drivers/md/dm-writecache.c | 8 ++++---- + 7 files changed, 28 insertions(+), 68 deletions(-) +Merging libata/for-next (c8474c7273ac Merge remote-tracking branch 'libata/for-6.8-fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux libata/for-next +Merge made by the 'ort' strategy. + drivers/ata/ahci.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) +Merging pcmcia/pcmcia-next (4f733de8b78a pcmcia: tcic: remove unneeded "&" in call to setup_timer()) +$ git merge -m Merge branch 'pcmcia-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git pcmcia/pcmcia-next +Already up to date. +Merging mmc/next (4e99ffb173fa mmc: core Drop BLK_BOUNCE_HIGH) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc/next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml | 3 ++- + drivers/mmc/core/block.c | 12 ++++++------ + drivers/mmc/core/host.c | 5 +++-- + drivers/mmc/core/queue.c | 2 -- + 4 files changed, 11 insertions(+), 11 deletions(-) +Merging mfd/for-mfd-next (1e0ea9e75ff3 mfd: wm831x: Remove redundant forever while loop) +$ git merge -m Merge branch 'for-mfd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd/for-mfd-next +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/mfd/iqs62x.yaml | 2 +- + .../devicetree/bindings/mfd/qcom,tcsr.yaml | 2 + + drivers/mfd/cros_ec_dev.c | 9 +++++ + drivers/mfd/intel-lpss-pci.c | 28 ++++++++++---- + drivers/mfd/intel-lpss.c | 9 ++++- + drivers/mfd/intel-lpss.h | 14 ++++++- + drivers/mfd/lpc_ich.c | 3 +- + drivers/mfd/omap-usb-host.c | 2 +- + drivers/mfd/rave-sp.c | 2 +- + drivers/mfd/wm831x-auxadc.c | 43 +++++++++------------- + include/linux/mfd/sun4i-gpadc.h | 4 +- + 11 files changed, 77 insertions(+), 41 deletions(-) +Merging backlight/for-backlight-next (3b75d271e161 backlight: hx8357: Fix potential NULL pointer dereference) +$ git merge -m Merge branch 'for-backlight-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight/for-backlight-next +Merge made by the 'ort' strategy. + drivers/video/backlight/hx8357.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) +Merging battery/for-next (4c5d387d79a6 power: supply: twl4030_madc: Use devm_power_supply_register() helper) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery/for-next +Merge made by the 'ort' strategy. + drivers/power/supply/bq27xxx_battery.c | 41 +++++++++++----- + drivers/power/supply/bq27xxx_battery_i2c.c | 46 ++++++++---------- + drivers/power/supply/da9030_battery.c | 6 +-- + drivers/power/supply/da9052-battery.c | 4 +- + drivers/power/supply/da9150-charger.c | 72 ++++++++--------------------- + drivers/power/supply/ds2760_battery.c | 4 +- + drivers/power/supply/goldfish_battery.c | 24 +++------- + drivers/power/supply/lp8727_charger.c | 35 +++----------- + drivers/power/supply/lp8788-charger.c | 21 +++------ + drivers/power/supply/pcf50633-charger.c | 23 ++++----- + drivers/power/supply/rt5033_battery.c | 14 ++---- + drivers/power/supply/rx51_battery.c | 57 +++++------------------ + drivers/power/supply/tps65090-charger.c | 18 +++----- + drivers/power/supply/twl4030_madc_battery.c | 59 ++++++----------------- + drivers/power/supply/wm831x_backup.c | 13 +----- + drivers/power/supply/wm831x_power.c | 24 ++++------ + include/linux/power/bq27xxx_battery.h | 1 - + 17 files changed, 147 insertions(+), 315 deletions(-) +Merging regulator/for-next (a2fc922ece40 Merge remote-tracking branch 'regulator/for-6.9' into regulator-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/regulator/ti,tps65132.yaml | 84 ++++++++++++++++++++++ + .../bindings/regulator/tps65132-regulator.txt | 46 ------------ + drivers/regulator/fixed-helper.c | 4 +- + drivers/regulator/qcom_smd-regulator.c | 19 ++--- + 4 files changed, 96 insertions(+), 57 deletions(-) + create mode 100644 Documentation/devicetree/bindings/regulator/ti,tps65132.yaml + delete mode 100644 Documentation/devicetree/bindings/regulator/tps65132-regulator.txt +Merging security/next (5a287d3d2b9d lsm: fix default return value of the socket_getpeersec_*() hooks) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git security/next +Auto-merging include/linux/lsm_hook_defs.h +Auto-merging security/security.c +Merge made by the 'ort' strategy. + include/linux/lsm_hook_defs.h | 4 ++-- + security/security.c | 45 ++++++++++++++++++++++++++++++++++++++----- + 2 files changed, 42 insertions(+), 7 deletions(-) +Merging apparmor/apparmor-next (8ead196be219 apparmor: Fix memory leak in unpack_profile()) +$ git merge -m Merge branch 'apparmor-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor apparmor/apparmor-next +Already up to date. +Merging integrity/next-integrity (1ed4b5631002 Revert "KEYS: encrypted: Add check for strsep") +$ git merge -m Merge branch 'next-integrity' of git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity integrity/next-integrity +Already up to date. +Merging selinux/next (90593caf7db7 selinux: reduce the object class calculations at inode init time) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git selinux/next +Auto-merging security/selinux/hooks.c +Merge made by the 'ort' strategy. + security/selinux/hooks.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) +Merging smack/next (f0816d4332c3 ramfs: Initialize security of in-memory inodes) +$ git merge -m Merge branch 'next' of git://github.com/cschaufler/smack-next smack/next +Merge made by the 'ort' strategy. + fs/ramfs/inode.c | 32 +++++++++++++- + security/smack/smack_lsm.c | 105 ++++++++++++++++++++++++++------------------- + 2 files changed, 91 insertions(+), 46 deletions(-) +Merging tomoyo/master (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'master' of https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git tomoyo/master +Already up to date. +Merging tpmdd/next (610347effc2e Merge tag 'Wstringop-overflow-for-6.8-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git tpmdd/next +Already up to date. +Merging watchdog/master (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'master' of git://www.linux-watchdog.org/linux-watchdog-next.git watchdog/master +Already up to date. +Merging iommu/next (75f74f85a42e Merge branches 'apple/dart', 'arm/rockchip', 'arm/smmu', 'virtio', 'x86/vt-d', 'x86/amd' and 'core' into next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git iommu/next +Already up to date. +Merging audit/next (aa13b709084a audit: use KMEM_CACHE() instead of kmem_cache_create()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git audit/next +Merge made by the 'ort' strategy. + kernel/audit.c | 4 +--- + kernel/auditfilter.c | 2 +- + 2 files changed, 2 insertions(+), 4 deletions(-) +Merging devicetree/for-next (85f838adad54 dt-bindings: fpga: Convert fpga-region binding to yaml) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree/for-next +Auto-merging Documentation/devicetree/bindings/Makefile +Auto-merging MAINTAINERS +Auto-merging drivers/of/property.c +Merge made by the 'ort' strategy. + Documentation/devicetree/bindings/Makefile | 3 - + .../devicetree/bindings/fpga/fpga-region.txt | 479 --------------------- + .../devicetree/bindings/fpga/fpga-region.yaml | 358 +++++++++++++++ + .../devicetree/bindings/gpio/mrvl-gpio.yaml | 2 +- + Documentation/devicetree/bindings/i2c/i2c-pxa.yaml | 2 +- + .../mediatek,mt6577-sysirq.yaml | 85 ++++ + .../interrupt-controller/mediatek,sysirq.txt | 44 -- + .../devicetree/bindings/rtc/sa1100-rtc.yaml | 2 +- + .../devicetree/bindings/submitting-patches.rst | 23 +- + .../devicetree/bindings/timer/mrvl,mmp-timer.yaml | 2 +- + .../devicetree/bindings/trivial-devices.yaml | 2 + + MAINTAINERS | 5 +- + drivers/of/property.c | 2 +- + 13 files changed, 461 insertions(+), 548 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.txt + create mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.yaml + create mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,mt6577-sysirq.yaml + delete mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt +Merging dt-krzk/for-next (8c82b4eef297 ARM: dts: sti: minor whitespace cleanup around '=') +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk/for-next +Merge made by the 'ort' strategy. + arch/arm/boot/dts/marvell/dove-cubox.dts | 4 ++-- + arch/arm/boot/dts/marvell/mmp2-brownstone.dts | 2 +- + arch/arm/boot/dts/st/stih407-pinctrl.dtsi | 8 ++++---- + arch/arm/boot/dts/ti/davinci/da850.dtsi | 4 ++-- + 4 files changed, 9 insertions(+), 9 deletions(-) +Merging mailbox/for-next (cd795fb0c352 mailbox: mtk-cmdq: Add CMDQ driver support for mt8188) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git mailbox/for-next +Already up to date. +Merging spi/for-next (60fbb72e3018 Merge remote-tracking branch 'spi/for-6.9' into spi-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/spi/samsung,spi.yaml | 1 + + .../devicetree/bindings/spi/spi-fsl-lpspi.yaml | 1 + + .../devicetree/bindings/spi/spi-nxp-fspi.yaml | 18 ++++-- + drivers/spi/Kconfig | 2 +- + drivers/spi/spi-mt65xx.c | 5 ++ + drivers/spi/spi-nxp-fspi.c | 2 +- + drivers/spi/spi-s3c64xx.c | 27 ++++++--- + drivers/spi/spi.c | 69 +++------------------- + include/linux/spi/spi.h | 2 +- + 9 files changed, 51 insertions(+), 76 deletions(-) +Merging tip/master (078b7b997b47 Merge x86/boot into tip/master) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git tip/master +Auto-merging arch/x86/Kconfig +Auto-merging arch/x86/Makefile +Auto-merging arch/x86/include/asm/pgtable.h +Auto-merging arch/x86/kernel/Makefile +Auto-merging arch/x86/kernel/alternative.c +Auto-merging arch/x86/kernel/cpu/mshyperv.c +Auto-merging arch/x86/kernel/kvm.c +Auto-merging arch/x86/kernel/smp.c +Auto-merging arch/x86/mm/dump_pagetables.c +Auto-merging arch/x86/mm/tlb.c +Auto-merging arch/x86/net/bpf_jit_comp.c +Auto-merging scripts/mod/modpost.c +Auto-merging tools/arch/x86/include/asm/cpufeatures.h +Auto-merging tools/arch/x86/include/asm/msr-index.h +Merge made by the 'ort' strategy. + Documentation/admin-guide/hw-vuln/spectre.rst | 8 +- + Documentation/admin-guide/kernel-parameters.txt | 11 +- + Documentation/arch/x86/pti.rst | 6 +- + Documentation/arch/x86/x86_64/fred.rst | 96 ++++++++ + Documentation/arch/x86/x86_64/index.rst | 1 + + Documentation/process/maintainer-tip.rst | 4 +- + arch/x86/Kconfig | 53 +++-- + arch/x86/Makefile | 8 +- + arch/x86/boot/compressed/ident_map_64.c | 4 +- + arch/x86/boot/compressed/sev.c | 4 + + arch/x86/configs/i386_defconfig | 2 +- + arch/x86/entry/Makefile | 5 +- + arch/x86/entry/calling.h | 55 +++-- + arch/x86/entry/entry_32.S | 6 +- + arch/x86/entry/entry_64.S | 29 ++- + arch/x86/entry/entry_64_fred.S | 131 +++++++++++ + arch/x86/entry/entry_fred.c | 294 ++++++++++++++++++++++++ + arch/x86/entry/vdso/Makefile | 4 +- + arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- + arch/x86/include/asm/asm-prototypes.h | 1 + + arch/x86/include/asm/cpufeatures.h | 2 + + arch/x86/include/asm/current.h | 9 +- + arch/x86/include/asm/desc.h | 2 - + arch/x86/include/asm/disabled-features.h | 18 +- + arch/x86/include/asm/extable_fixup_types.h | 4 +- + arch/x86/include/asm/fpu/sched.h | 10 +- + arch/x86/include/asm/fred.h | 97 ++++++++ + arch/x86/include/asm/ia32.h | 4 +- + arch/x86/include/asm/idtentry.h | 88 ++++++- + arch/x86/include/asm/linkage.h | 16 +- + arch/x86/include/asm/msr-index.h | 13 +- + arch/x86/include/asm/msr.h | 18 ++ + arch/x86/include/asm/nospec-branch.h | 53 ++--- + arch/x86/include/asm/page.h | 6 +- + arch/x86/include/asm/percpu.h | 191 +++++++++++---- + arch/x86/include/asm/pgalloc.h | 2 +- + arch/x86/include/asm/pgtable-3level.h | 2 +- + arch/x86/include/asm/pgtable.h | 18 +- + arch/x86/include/asm/pgtable_64.h | 3 +- + arch/x86/include/asm/preempt.h | 2 +- + arch/x86/include/asm/processor-flags.h | 2 +- + arch/x86/include/asm/processor.h | 3 + + arch/x86/include/asm/pti.h | 2 +- + arch/x86/include/asm/ptrace.h | 104 +++++++-- + arch/x86/include/asm/static_call.h | 2 +- + arch/x86/include/asm/switch_to.h | 8 +- + arch/x86/include/asm/text-patching.h | 2 + + arch/x86/include/asm/thread_info.h | 12 +- + arch/x86/include/asm/trapnr.h | 12 + + arch/x86/include/asm/uaccess_64.h | 11 +- + arch/x86/include/asm/vmx.h | 17 +- + arch/x86/include/uapi/asm/processor-flags.h | 7 + + arch/x86/kernel/Makefile | 1 + + arch/x86/kernel/acpi/wakeup_64.S | 24 +- + arch/x86/kernel/alternative.c | 23 +- + arch/x86/kernel/asm-offsets.c | 2 +- + arch/x86/kernel/callthunks.c | 32 ++- + arch/x86/kernel/cpu/acrn.c | 4 +- + arch/x86/kernel/cpu/amd.c | 2 +- + arch/x86/kernel/cpu/bugs.c | 43 ++-- + arch/x86/kernel/cpu/common.c | 39 +++- + arch/x86/kernel/cpu/cpuid-deps.c | 2 + + arch/x86/kernel/cpu/mce/core.c | 26 +++ + arch/x86/kernel/cpu/mshyperv.c | 15 +- + arch/x86/kernel/cpu/resctrl/core.c | 18 +- + arch/x86/kernel/cpu/resctrl/internal.h | 8 +- + arch/x86/kernel/cpu/resctrl/monitor.c | 48 ++-- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 29 +-- + arch/x86/kernel/dumpstack.c | 2 +- + arch/x86/kernel/espfix_64.c | 8 + + arch/x86/kernel/fred.c | 59 +++++ + arch/x86/kernel/ftrace.c | 3 +- + arch/x86/kernel/head_32.S | 4 +- + arch/x86/kernel/head_64.S | 39 +--- + arch/x86/kernel/idt.c | 4 +- + arch/x86/kernel/irqinit.c | 7 +- + arch/x86/kernel/kprobes/opt.c | 2 +- + arch/x86/kernel/kvm.c | 2 +- + arch/x86/kernel/ldt.c | 8 +- + arch/x86/kernel/nmi.c | 48 +++- + arch/x86/kernel/process_32.c | 7 +- + arch/x86/kernel/process_64.c | 74 ++++-- + arch/x86/kernel/sev-shared.c | 102 +++++++- + arch/x86/kernel/sev.c | 5 +- + arch/x86/kernel/smp.c | 10 +- + arch/x86/kernel/static_call.c | 2 +- + arch/x86/kernel/traps.c | 78 ++++++- + arch/x86/kernel/vmlinux.lds.S | 11 +- + arch/x86/kvm/mmu/mmu.c | 2 +- + arch/x86/kvm/mmu/mmu_internal.h | 2 +- + arch/x86/kvm/svm/svm.c | 2 +- + arch/x86/kvm/svm/vmenter.S | 4 +- + arch/x86/kvm/vmx/vmx.c | 14 +- + arch/x86/lib/Makefile | 2 +- + arch/x86/lib/cmpxchg16b_emu.S | 12 +- + arch/x86/lib/cmpxchg8b_emu.S | 30 ++- + arch/x86/lib/getuser.S | 24 +- + arch/x86/lib/putuser.S | 20 +- + arch/x86/lib/retpoline.S | 26 +-- + arch/x86/lib/x86-opcode-map.txt | 4 +- + arch/x86/mm/Makefile | 2 +- + arch/x86/mm/debug_pagetables.c | 4 +- + arch/x86/mm/dump_pagetables.c | 4 +- + arch/x86/mm/extable.c | 78 +++++++ + arch/x86/mm/fault.c | 32 +-- + arch/x86/mm/mem_encrypt.c | 56 ++--- + arch/x86/mm/mem_encrypt_identity.c | 10 +- + arch/x86/mm/pgtable.c | 4 +- + arch/x86/mm/tlb.c | 10 +- + arch/x86/net/bpf_jit_comp.c | 4 +- + arch/x86/net/bpf_jit_comp32.c | 2 +- + arch/x86/purgatory/Makefile | 2 +- + arch/x86/xen/xen-asm.S | 10 +- + drivers/irqchip/irq-gic-v3.c | 51 ++-- + drivers/irqchip/irq-gic.c | 27 +-- + drivers/xen/events/events_base.c | 2 +- + include/linux/bitmap.h | 3 + + include/linux/compiler-gcc.h | 2 +- + include/linux/compiler.h | 2 +- + include/linux/indirect_call_wrapper.h | 2 +- + include/linux/irq.h | 2 +- + include/linux/irqhandler.h | 2 +- + include/linux/module.h | 2 +- + include/linux/objtool.h | 2 +- + include/linux/pti.h | 2 +- + include/net/netfilter/nf_tables_core.h | 2 +- + include/net/tc_wrapper.h | 2 +- + kernel/cpu.c | 5 +- + kernel/irq/irq_sim.c | 28 +-- + kernel/irq/irqdesc.c | 112 +++++---- + kernel/trace/ring_buffer.c | 2 +- + net/netfilter/Makefile | 2 +- + net/netfilter/nf_tables_core.c | 6 +- + net/netfilter/nft_ct.c | 4 +- + net/netfilter/nft_lookup.c | 2 +- + net/sched/sch_api.c | 2 +- + scripts/Makefile.lib | 8 +- + scripts/Makefile.vmlinux_o | 2 +- + scripts/generate_rust_target.rs | 2 +- + scripts/mod/modpost.c | 2 +- + tools/arch/x86/include/asm/cpufeatures.h | 2 + + tools/arch/x86/include/asm/disabled-features.h | 18 +- + tools/arch/x86/include/asm/msr-index.h | 13 +- + tools/arch/x86/lib/x86-opcode-map.txt | 4 +- + tools/objtool/arch/x86/decode.c | 19 +- + tools/objtool/arch/x86/special.c | 2 +- + tools/objtool/check.c | 4 +- + 147 files changed, 2201 insertions(+), 756 deletions(-) + create mode 100644 Documentation/arch/x86/x86_64/fred.rst + create mode 100644 arch/x86/entry/entry_64_fred.S + create mode 100644 arch/x86/entry/entry_fred.c + create mode 100644 arch/x86/include/asm/fred.h + create mode 100644 arch/x86/kernel/fred.c +Merging clockevents/timers/drivers/next (0076a37a426b dt-bindings: timer: renesas,tmu: Document input capture interrupt) +$ git merge -m Merge branch 'timers/drivers/next' of git://git.linaro.org/people/daniel.lezcano/linux.git clockevents/timers/drivers/next +Merge made by the 'ort' strategy. + .../devicetree/bindings/timer/renesas,tmu.yaml | 18 ++++++++++++++++-- + .../bindings/timer/samsung,exynos4210-mct.yaml | 2 ++ + drivers/clocksource/timer-imx-gpt.c | 3 +-- + drivers/clocksource/timer-stm32.c | 4 ++-- + drivers/clocksource/timer-ti-32k.c | 2 +- + 5 files changed, 22 insertions(+), 7 deletions(-) +Merging edac/edac-for-next (5f9d6dfd6c4a Merge ras/edac-amd-atl into for-next) +$ git merge -m Merge branch 'edac-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac/edac-for-next +Auto-merging MAINTAINERS +Merge made by the 'ort' strategy. + Documentation/RAS/address-translation.rst | 24 + + Documentation/RAS/{ras.rst => error-decoding.rst} | 11 +- + Documentation/RAS/index.rst | 14 + + Documentation/index.rst | 2 +- + MAINTAINERS | 7 + + drivers/edac/Kconfig | 1 + + drivers/edac/amd64_edac.c | 286 +-------- + drivers/edac/synopsys_edac.c | 4 +- + drivers/ras/Kconfig | 1 + + drivers/ras/Makefile | 2 + + drivers/ras/amd/atl/Kconfig | 20 + + drivers/ras/amd/atl/Makefile | 18 + + drivers/ras/amd/atl/access.c | 133 ++++ + drivers/ras/amd/atl/core.c | 225 +++++++ + drivers/ras/amd/atl/dehash.c | 500 +++++++++++++++ + drivers/ras/amd/atl/denormalize.c | 719 ++++++++++++++++++++++ + drivers/ras/amd/atl/internal.h | 305 +++++++++ + drivers/ras/amd/atl/map.c | 682 ++++++++++++++++++++ + drivers/ras/amd/atl/reg_fields.h | 606 ++++++++++++++++++ + drivers/ras/amd/atl/system.c | 284 +++++++++ + drivers/ras/amd/atl/umc.c | 92 +++ + drivers/ras/ras.c | 31 + + include/linux/ras.h | 16 + + 23 files changed, 3694 insertions(+), 289 deletions(-) + create mode 100644 Documentation/RAS/address-translation.rst + rename Documentation/RAS/{ras.rst => error-decoding.rst} (73%) + create mode 100644 Documentation/RAS/index.rst + create mode 100644 drivers/ras/amd/atl/Kconfig + create mode 100644 drivers/ras/amd/atl/Makefile + create mode 100644 drivers/ras/amd/atl/access.c + create mode 100644 drivers/ras/amd/atl/core.c + create mode 100644 drivers/ras/amd/atl/dehash.c + create mode 100644 drivers/ras/amd/atl/denormalize.c + create mode 100644 drivers/ras/amd/atl/internal.h + create mode 100644 drivers/ras/amd/atl/map.c + create mode 100644 drivers/ras/amd/atl/reg_fields.h + create mode 100644 drivers/ras/amd/atl/system.c + create mode 100644 drivers/ras/amd/atl/umc.c +Merging ftrace/for-next (4af12c95cbe8 Merge probes/for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git ftrace/for-next +Merge made by the 'ort' strategy. +Merging rcu/rcu/next (bc31e6cb27a9 rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks) +$ git merge -m Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git rcu/rcu/next +Auto-merging Documentation/admin-guide/kernel-parameters.rst +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging include/linux/sched.h +Auto-merging kernel/fork.c +Merge made by the 'ort' strategy. + Documentation/RCU/checklist.rst | 32 +- + Documentation/RCU/rcu_dereference.rst | 5 +- + Documentation/RCU/whatisRCU.rst | 19 +- + Documentation/admin-guide/kernel-parameters.rst | 1 + + Documentation/admin-guide/kernel-parameters.txt | 489 ++++++++++++------------ + arch/x86/kernel/tsc.c | 2 +- + fs/proc/bootconfig.c | 12 +- + include/linux/hrtimer.h | 4 +- + include/linux/rcu_sync.h | 1 - + include/linux/rcupdate.h | 4 +- + include/linux/sched.h | 2 + + init/init_task.c | 1 + + kernel/context_tracking.c | 4 + + kernel/fork.c | 1 + + kernel/rcu/Kconfig | 13 + + kernel/rcu/rcu.h | 13 +- + kernel/rcu/rcuscale.c | 6 +- + kernel/rcu/rcutorture.c | 13 +- + kernel/rcu/srcutree.c | 24 +- + kernel/rcu/sync.c | 16 - + kernel/rcu/tasks.h | 89 +++-- + kernel/rcu/tree.c | 235 ++++++++---- + kernel/rcu/tree.h | 20 +- + kernel/rcu/tree_exp.h | 83 +--- + kernel/rcu/tree_nocb.h | 69 ++-- + kernel/rcu/tree_plugin.h | 52 +-- + kernel/time/hrtimer.c | 3 + + 27 files changed, 651 insertions(+), 562 deletions(-) +Merging kvm/next (a9ef277488cf x86/kvm: Fix SEV check in sev_map_percpu_data()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm/next +Auto-merging arch/x86/kernel/kvm.c +Merge made by the 'ort' strategy. + arch/riscv/include/uapi/asm/kvm.h | 27 ++++++ + arch/riscv/kvm/vcpu_onereg.c | 54 ++++++++++++ + arch/x86/include/asm/kvm_host.h | 2 + + arch/x86/kernel/kvm.c | 3 +- + arch/x86/kvm/hyperv.c | 50 +++++++++++ + arch/x86/kvm/hyperv.h | 3 + + arch/x86/kvm/x86.c | 7 ++ + tools/testing/selftests/kvm/riscv/get-reg-list.c | 108 +++++++++++++++++++++++ + 8 files changed, 253 insertions(+), 1 deletion(-) +Merging kvm-arm/next (87bbb6a32237 Merge branch kvm-arm64/misc into kvmarm/next) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git kvm-arm/next +Merge made by the 'ort' strategy. + tools/testing/selftests/kvm/aarch64/set_id_regs.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) +Merging kvms390/next (10f7b1dcdfe0 KVM: s390: cpu model: Use proper define for facility mask size) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390/next +Already up to date. +Merging kvm-ppc/topic/ppc-kvm (180c6b072bf3 KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception) +$ git merge -m Merge branch 'topic/ppc-kvm' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git kvm-ppc/topic/ppc-kvm +Already up to date. +Merging kvm-riscv/riscv_kvm_next (4d0e8f9a361b KVM: riscv: selftests: Add Zfa extension to get-reg-list test) +$ git merge -m Merge branch 'riscv_kvm_next' of https://github.com/kvm-riscv/linux.git kvm-riscv/riscv_kvm_next +Already up to date. +Merging kvm-x86/next (f0f3b810edda Merge branches 'generic', 'misc', 'pmu' and 'selftests') +$ git merge -m Merge branch 'next' of https://github.com/kvm-x86/linux.git kvm-x86/next +Auto-merging arch/x86/kvm/x86.c +Auto-merging tools/testing/selftests/kvm/riscv/get-reg-list.c +Merge made by the 'ort' strategy. + arch/x86/include/asm/kvm-x86-pmu-ops.h | 3 +- + arch/x86/kvm/emulate.c | 2 +- + arch/x86/kvm/kvm_emulate.h | 2 +- + arch/x86/kvm/pmu.c | 20 +- + arch/x86/kvm/pmu.h | 5 +- + arch/x86/kvm/svm/pmu.c | 17 +- + arch/x86/kvm/vmx/pmu_intel.c | 178 +++--- + arch/x86/kvm/x86.c | 26 +- + tools/testing/selftests/kvm/Makefile | 2 + + tools/testing/selftests/kvm/aarch64/arch_timer.c | 12 +- + tools/testing/selftests/kvm/aarch64/hypercalls.c | 16 +- + .../selftests/kvm/aarch64/page_fault_test.c | 6 +- + tools/testing/selftests/kvm/aarch64/smccc_filter.c | 2 +- + .../selftests/kvm/aarch64/vpmu_counter_access.c | 12 +- + tools/testing/selftests/kvm/demand_paging_test.c | 4 +- + tools/testing/selftests/kvm/dirty_log_perf_test.c | 4 +- + tools/testing/selftests/kvm/dirty_log_test.c | 4 +- + tools/testing/selftests/kvm/get-reg-list.c | 2 +- + tools/testing/selftests/kvm/guest_print_test.c | 8 +- + .../testing/selftests/kvm/hardware_disable_test.c | 6 +- + .../testing/selftests/kvm/include/kvm_util_base.h | 4 + + tools/testing/selftests/kvm/include/test_util.h | 2 + + tools/testing/selftests/kvm/include/x86_64/pmu.h | 97 ++++ + .../selftests/kvm/include/x86_64/processor.h | 150 +++-- + tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- + tools/testing/selftests/kvm/kvm_page_table_test.c | 4 +- + .../testing/selftests/kvm/lib/aarch64/processor.c | 2 +- + tools/testing/selftests/kvm/lib/aarch64/vgic.c | 4 +- + tools/testing/selftests/kvm/lib/elf.c | 2 +- + tools/testing/selftests/kvm/lib/kvm_util.c | 81 ++- + tools/testing/selftests/kvm/lib/memstress.c | 2 +- + tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- + tools/testing/selftests/kvm/lib/s390x/processor.c | 2 +- + tools/testing/selftests/kvm/lib/test_util.c | 25 + + tools/testing/selftests/kvm/lib/userfaultfd_util.c | 2 +- + tools/testing/selftests/kvm/lib/x86_64/pmu.c | 31 ++ + tools/testing/selftests/kvm/lib/x86_64/processor.c | 36 +- + tools/testing/selftests/kvm/lib/x86_64/vmx.c | 6 +- + .../kvm/memslot_modification_stress_test.c | 2 +- + tools/testing/selftests/kvm/memslot_perf_test.c | 6 +- + tools/testing/selftests/kvm/riscv/get-reg-list.c | 2 +- + tools/testing/selftests/kvm/rseq_test.c | 4 +- + tools/testing/selftests/kvm/s390x/resets.c | 4 +- + tools/testing/selftests/kvm/s390x/sync_regs_test.c | 20 +- + .../testing/selftests/kvm/set_memory_region_test.c | 6 +- + .../selftests/kvm/system_counter_offset_test.c | 2 +- + tools/testing/selftests/kvm/x86_64/amx_test.c | 6 +- + tools/testing/selftests/kvm/x86_64/cpuid_test.c | 4 +- + .../testing/selftests/kvm/x86_64/flds_emulation.h | 2 +- + tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 5 +- + .../testing/selftests/kvm/x86_64/hyperv_features.c | 9 +- + tools/testing/selftests/kvm/x86_64/hyperv_ipi.c | 2 +- + .../selftests/kvm/x86_64/hyperv_tlb_flush.c | 2 +- + .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 42 +- + .../selftests/kvm/x86_64/nx_huge_pages_test.c | 6 +- + .../selftests/kvm/x86_64/platform_info_test.c | 2 +- + .../selftests/kvm/x86_64/pmu_counters_test.c | 617 +++++++++++++++++++++ + .../selftests/kvm/x86_64/pmu_event_filter_test.c | 145 ++--- + .../selftests/kvm/x86_64/sev_migrate_tests.c | 28 +- + .../kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 6 +- + .../testing/selftests/kvm/x86_64/sync_regs_test.c | 10 +- + .../selftests/kvm/x86_64/ucna_injection_test.c | 8 +- + .../selftests/kvm/x86_64/userspace_io_test.c | 2 +- + .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 29 +- + .../selftests/kvm/x86_64/vmx_apic_access_test.c | 2 +- + .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 16 +- + .../vmx_exception_with_invalid_guest_state.c | 2 +- + .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 19 +- + .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 2 +- + .../testing/selftests/kvm/x86_64/xapic_ipi_test.c | 8 +- + .../testing/selftests/kvm/x86_64/xcr0_cpuid_test.c | 2 +- + .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 36 +- + tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 2 +- + virt/kvm/kvm_main.c | 4 +- + 74 files changed, 1315 insertions(+), 534 deletions(-) + create mode 100644 tools/testing/selftests/kvm/include/x86_64/pmu.h + create mode 100644 tools/testing/selftests/kvm/lib/x86_64/pmu.c + create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +Merging xen-tip/linux-next (2d2db7d40254 xen/gntdev: Fix the abuse of underlying struct page in DMA-buf import) +$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git xen-tip/linux-next +Already up to date. +Merging percpu/for-next (2d9ad81ef935 Merge branch 'for-6.8-fixes' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git percpu/for-next +Merge made by the 'ort' strategy. + arch/riscv/mm/tlbflush.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +Merging workqueues/for-next (15930da42f89 workqueue: Don't call cpumask_test_cpu() with -1 CPU in wq_update_node_max_active()) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git workqueues/for-next +Merge made by the 'ort' strategy. + include/linux/workqueue.h | 76 +++-- + kernel/workqueue.c | 816 +++++++++++++++++++++++++++++++++++++-------- + tools/workqueue/wq_dump.py | 95 +++++- + 3 files changed, 813 insertions(+), 174 deletions(-) +Merging drivers-x86/for-next (3f399b5d7189 platform/x86: wmi: Use ACPI device name in netlink event) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86/for-next +Auto-merging drivers/platform/x86/wmi.c +Merge made by the 'ort' strategy. + .../admin-guide/laptops/thinkpad-acpi.rst | 7 +++++- + drivers/platform/mellanox/mlxbf-bootctl.c | 14 +++++------ + drivers/platform/x86/Kconfig | 6 ----- + drivers/platform/x86/asus-wmi.c | 1 - + drivers/platform/x86/dell/Kconfig | 3 --- + drivers/platform/x86/dell/dell-laptop.c | 2 -- + drivers/platform/x86/dell/dell-wmi-privacy.c | 1 - + drivers/platform/x86/huawei-wmi.c | 1 - + drivers/platform/x86/silicom-platform.c | 7 +----- + drivers/platform/x86/thinkpad_acpi.c | 29 ++++++++++++++++------ + drivers/platform/x86/wmi.c | 2 +- + 11 files changed, 37 insertions(+), 36 deletions(-) +Merging chrome-platform/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform/for-next +Already up to date. +Merging chrome-platform-firmware/for-firmware-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-firmware-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform-firmware/for-firmware-next +Already up to date. +Merging hsi/for-next (fa72d143471d HSI: omap_ssi: Remove usage of the deprecated ida_simple_xx() API) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git hsi/for-next +Already up to date. +Merging leds-lj/for-leds-next (54602f38551e leds: Make flash and multicolor dependencies unconditional) +$ git merge -m Merge branch 'for-leds-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git leds-lj/for-leds-next +Merge made by the 'ort' strategy. + .../ABI/testing/sysfs-class-led-trigger-netdev | 12 + + .../ABI/testing/sysfs-class-led-trigger-tty | 14 +- + .../devicetree/bindings/leds/leds-qcom-lpg.yaml | 82 ++++- + drivers/leds/Kconfig | 4 +- + drivers/leds/flash/Kconfig | 4 +- + drivers/leds/led-class.c | 6 + + drivers/leds/led-triggers.c | 9 + + drivers/leds/leds-aw200xx.c | 2 +- + drivers/leds/rgb/leds-qcom-lpg.c | 346 +++++++++++++++++++-- + drivers/leds/trigger/ledtrig-audio.c | 2 + + drivers/leds/trigger/ledtrig-default-on.c | 1 + + drivers/leds/trigger/ledtrig-netdev.c | 98 +++++- + drivers/leds/trigger/ledtrig-panic.c | 23 +- + drivers/staging/greybus/Kconfig | 2 +- + drivers/staging/greybus/light.c | 21 -- + include/dt-bindings/leds/common.h | 3 + + include/linux/led-class-flash.h | 24 -- + include/linux/led-class-multicolor.h | 29 -- + include/linux/leds.h | 19 -- + 19 files changed, 541 insertions(+), 160 deletions(-) +$ git reset --hard HEAD^ +HEAD is now at 4b7d6cc566fd Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git +Merging next-20240125 version of leds-lj +$ git merge -m next-20240125/leds-lj 4289e434c46c8cbd32cf8b67fa7689b3d2ca4361 +Already up to date. +Merging ipmi/for-next (296455ade1fd Merge tag 'char-misc-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc) +$ git merge -m Merge branch 'for-next' of git://github.com/cminyard/linux-ipmi.git ipmi/for-next +Already up to date. +Merging driver-core/driver-core-next (f297a3844aa0 driver core: component: fix spellos) +$ git merge -m Merge branch 'driver-core-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core/driver-core-next +Auto-merging drivers/base/cpu.c +Merge made by the 'ort' strategy. + drivers/base/component.c | 4 ++-- + drivers/base/cpu.c | 2 +- + fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- + fs/kernfs/file.c | 8 +++++--- + fs/kernfs/kernfs-internal.h | 2 ++ + include/linux/cpu.h | 2 +- + include/linux/kernfs.h | 10 ++++++---- + 7 files changed, 37 insertions(+), 22 deletions(-) +Merging usb/usb-next (f1a27f081c1f usb: typec: qcom-pmic-typec: allow different implementations for the port backend) +$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb/usb-next +Auto-merging Documentation/usb/gadget-testing.rst +Auto-merging drivers/usb/core/hub.c +Auto-merging drivers/usb/dwc3/host.c +Auto-merging drivers/usb/host/xhci.h +Auto-merging drivers/usb/typec/tcpm/tcpm.c +Auto-merging tools/testing/selftests/Makefile +Merge made by the 'ort' strategy. + Documentation/ABI/testing/configfs-usb-gadget-ffs | 12 +- + .../devicetree/bindings/usb/ci-hdrc-usb2.yaml | 2 +- + .../devicetree/bindings/usb/fcs,fsa4480.yaml | 12 +- + .../devicetree/bindings/usb/generic-ehci.yaml | 1 + + .../devicetree/bindings/usb/gpio-sbu-mux.yaml | 12 +- + .../devicetree/bindings/usb/ite,it5205.yaml | 72 ++ + .../devicetree/bindings/usb/mediatek,mtu3.yaml | 5 +- + .../devicetree/bindings/usb/nxp,ptn36502.yaml | 12 +- + .../devicetree/bindings/usb/onnn,nb7vpq904m.yaml | 13 +- + .../bindings/usb/qcom,wcd939x-usbss.yaml | 12 +- + .../devicetree/bindings/usb/snps,dwc3.yaml | 7 + + .../devicetree/bindings/usb/usb-nop-xceiv.yaml | 11 +- + .../devicetree/bindings/usb/usb-switch.yaml | 67 + + Documentation/devicetree/bindings/usb/usb.yaml | 2 + + Documentation/usb/gadget-testing.rst | 8 + + drivers/phy/Kconfig | 1 + + drivers/phy/Makefile | 1 + + drivers/phy/phy-core.c | 47 + + drivers/phy/realtek/Kconfig | 32 + + drivers/phy/realtek/Makefile | 3 + + drivers/phy/realtek/phy-rtk-usb2.c | 1312 ++++++++++++++++++++ + drivers/phy/realtek/phy-rtk-usb3.c | 748 +++++++++++ + drivers/usb/core/Kconfig | 17 + + drivers/usb/core/driver.c | 8 +- + drivers/usb/core/hcd.c | 20 +- + drivers/usb/core/hub.c | 29 + + drivers/usb/core/phy.c | 120 ++ + drivers/usb/core/phy.h | 3 + + drivers/usb/dwc3/core.c | 3 + + drivers/usb/dwc3/core.h | 2 + + drivers/usb/dwc3/dwc3-of-simple.c | 3 +- + drivers/usb/dwc3/host.c | 51 + + drivers/usb/gadget/function/f_fs.c | 20 + + drivers/usb/host/ehci-orion.c | 18 +- + drivers/usb/host/xhci-caps.h | 85 ++ + drivers/usb/host/xhci-port.h | 176 +++ + drivers/usb/host/xhci.h | 262 +--- + drivers/usb/mtu3/mtu3_host.c | 30 + + drivers/usb/phy/phy-generic.c | 55 +- + drivers/usb/storage/sddr55.c | 4 +- + drivers/usb/typec/altmodes/displayport.c | 162 ++- + drivers/usb/typec/bus.c | 102 ++ + drivers/usb/typec/class.c | 59 + + drivers/usb/typec/class.h | 1 + + drivers/usb/typec/mux/Kconfig | 10 + + drivers/usb/typec/mux/Makefile | 1 + + drivers/usb/typec/mux/it5205.c | 294 +++++ + drivers/usb/typec/tcpm/fusb302.c | 2 +- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.c | 228 +--- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h | 27 + + .../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.c | 159 ++- + .../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.h | 92 +- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c | 290 ++++- + drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.h | 172 +-- + drivers/usb/typec/tcpm/tcpci.c | 26 +- + drivers/usb/typec/tcpm/tcpci_maxim.h | 1 + + drivers/usb/typec/tcpm/tcpci_maxim_core.c | 38 +- + drivers/usb/typec/tcpm/tcpm.c | 1028 ++++++++++++--- + drivers/usb/typec/tcpm/wcove.c | 2 +- + drivers/usb/typec/ucsi/ucsi_ccg.c | 92 +- + include/linux/phy/phy.h | 21 + + include/linux/usb/audio-v2.h | 4 +- + include/linux/usb/pd.h | 1 + + include/linux/usb/pd_vdo.h | 8 +- + include/linux/usb/tcpci.h | 13 + + include/linux/usb/tcpm.h | 16 +- + include/linux/usb/typec.h | 7 + + include/linux/usb/typec_altmode.h | 30 + + include/uapi/linux/usb/ch9.h | 2 + + tools/testing/selftests/Makefile | 1 + + tools/testing/selftests/devices/Makefile | 4 + + .../devices/boards/Dell Inc.,XPS 13 9300.yaml | 40 + + .../selftests/devices/boards/google,spherion.yaml | 50 + + tools/testing/selftests/devices/ksft.py | 90 ++ + .../selftests/devices/test_discoverable_devices.py | 318 +++++ + 75 files changed, 5652 insertions(+), 1037 deletions(-) + create mode 100644 Documentation/devicetree/bindings/usb/ite,it5205.yaml + create mode 100644 Documentation/devicetree/bindings/usb/usb-switch.yaml + create mode 100644 drivers/phy/realtek/Kconfig + create mode 100644 drivers/phy/realtek/Makefile + create mode 100644 drivers/phy/realtek/phy-rtk-usb2.c + create mode 100644 drivers/phy/realtek/phy-rtk-usb3.c + create mode 100644 drivers/usb/host/xhci-caps.h + create mode 100644 drivers/usb/host/xhci-port.h + create mode 100644 drivers/usb/typec/mux/it5205.c + create mode 100644 drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h + create mode 100644 tools/testing/selftests/devices/Makefile + create mode 100644 tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml + create mode 100644 tools/testing/selftests/devices/boards/google,spherion.yaml + create mode 100644 tools/testing/selftests/devices/ksft.py + create mode 100755 tools/testing/selftests/devices/test_discoverable_devices.py +Merging thunderbolt/next (dec6a613574c thunderbolt: Remove usage of the deprecated ida_simple_xx() API) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt/next +Auto-merging drivers/thunderbolt/tb_regs.h +Auto-merging drivers/thunderbolt/usb4.c +Merge made by the 'ort' strategy. + drivers/thunderbolt/domain.c | 11 ++-- + drivers/thunderbolt/icm.c | 2 +- + drivers/thunderbolt/lc.c | 45 ++++++++++++++ + drivers/thunderbolt/nhi.c | 25 +++++--- + drivers/thunderbolt/nvm.c | 4 +- + drivers/thunderbolt/path.c | 13 ++++ + drivers/thunderbolt/switch.c | 138 ++++++++++++++++++++++++++++++++++++------ + drivers/thunderbolt/tb.c | 26 +++++--- + drivers/thunderbolt/tb.h | 7 ++- + drivers/thunderbolt/tb_regs.h | 6 ++ + drivers/thunderbolt/usb4.c | 39 ++++++++++++ + drivers/thunderbolt/xdomain.c | 12 ++-- + 12 files changed, 278 insertions(+), 50 deletions(-) +Merging usb-serial/usb-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial/usb-next +Already up to date. +Merging tty/tty-next (fccc9d9233f9 tty: serial: uartps: Add rs485 support to uartps driver) +$ git merge -m Merge branch 'tty-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty/tty-next +Auto-merging drivers/bluetooth/btnxpuart.c +Auto-merging drivers/mfd/rave-sp.c +Auto-merging drivers/net/ethernet/qualcomm/qca_uart.c +Auto-merging drivers/tty/serial/8250/8250_pci1xxxx.c +Auto-merging drivers/tty/serial/max310x.c +Auto-merging drivers/video/fbdev/core/fbcon.c +Merge made by the 'ort' strategy. + .../devicetree/bindings/serial/cdns,uart.yaml | 1 + + .../devicetree/bindings/serial/fsl-lpuart.yaml | 1 + + .../devicetree/bindings/serial/renesas,hscif.yaml | 1 + + .../devicetree/bindings/serial/samsung_uart.yaml | 2 + + Documentation/driver-api/tty/console.rst | 45 ++ + Documentation/driver-api/tty/index.rst | 1 + + arch/m68k/amiga/config.c | 2 +- + arch/m68k/hp300/config.c | 6 +- + drivers/bluetooth/btmtkuart.c | 4 +- + drivers/bluetooth/btnxpuart.c | 4 +- + drivers/bluetooth/hci_serdev.c | 4 +- + drivers/gnss/serial.c | 2 +- + drivers/gnss/sirf.c | 2 +- + drivers/greybus/gb-beagleplay.c | 6 +- + drivers/iio/chemical/pms7003.c | 4 +- + drivers/iio/chemical/scd30_serial.c | 4 +- + drivers/iio/chemical/sps30_serial.c | 4 +- + drivers/iio/imu/bno055/bno055_ser_core.c | 4 +- + drivers/input/keyboard/amikbd.c | 6 +- + drivers/mfd/rave-sp.c | 4 +- + drivers/net/ethernet/qualcomm/qca_uart.c | 2 +- + drivers/nfc/pn533/uart.c | 4 +- + drivers/nfc/s3fwrn5/uart.c | 4 +- + drivers/platform/chrome/cros_ec_uart.c | 4 +- + drivers/platform/surface/aggregator/core.c | 4 +- + drivers/tty/Kconfig | 7 +- + drivers/tty/serdev/serdev-ttyport.c | 10 +- + drivers/tty/serial/8250/8250_pci1xxxx.c | 140 ++++- + drivers/tty/serial/8250/8250_port.c | 7 + + drivers/tty/serial/amba-pl011.c | 82 --- + drivers/tty/serial/fsl_linflexuart.c | 1 - + drivers/tty/serial/max310x.c | 327 +++++----- + drivers/tty/serial/qcom_geni_serial.c | 27 +- + drivers/tty/serial/samsung_tty.c | 251 ++++---- + drivers/tty/serial/serial_txx9.c | 3 +- + drivers/tty/serial/stm32-usart.c | 223 ++++--- + drivers/tty/serial/stm32-usart.h | 38 +- + drivers/tty/serial/xilinx_uartps.c | 236 +++++++- + drivers/tty/vt/Makefile | 4 +- + drivers/tty/vt/selection.c | 43 +- + drivers/tty/vt/vt.c | 659 ++++++++++----------- + drivers/tty/vt/vt_ioctl.c | 6 +- + drivers/video/console/dummycon.c | 38 +- + drivers/video/console/mdacon.c | 43 +- + drivers/video/console/newport_con.c | 69 +-- + drivers/video/console/sticon.c | 79 +-- + drivers/video/console/vgacon.c | 152 +++-- + drivers/video/fbdev/core/bitblit.c | 13 +- + drivers/video/fbdev/core/fbcon.c | 123 ++-- + drivers/video/fbdev/core/fbcon.h | 4 +- + drivers/video/fbdev/core/fbcon_ccw.c | 13 +- + drivers/video/fbdev/core/fbcon_cw.c | 13 +- + drivers/video/fbdev/core/fbcon_ud.c | 13 +- + drivers/video/fbdev/core/tileblit.c | 4 +- + drivers/video/fbdev/tgafb.c | 2 +- + include/linux/console.h | 126 ++-- + include/linux/console_struct.h | 1 - + include/linux/selection.h | 48 +- + include/linux/serdev.h | 8 +- + include/linux/serial_8250.h | 6 + + include/linux/soc/qcom/geni-se.h | 1 + + include/linux/vt_kern.h | 12 +- + include/uapi/linux/fb.h | 8 +- + include/uapi/linux/vesa.h | 18 + + lib/Kconfig.kgdb | 2 +- + sound/drivers/serial-generic.c | 4 +- + 66 files changed, 1654 insertions(+), 1335 deletions(-) + create mode 100644 Documentation/driver-api/tty/console.rst + create mode 100644 include/uapi/linux/vesa.h +Merging char-misc/char-misc-next (390b60f7638a mei: pxp: add dependency on Xe driver) +$ git merge -m Merge branch 'char-misc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc/char-misc-next +Merge made by the 'ort' strategy. + drivers/misc/hpilo.c | 8 ++++---- + drivers/misc/mei/gsc-me.c | 5 +++++ + drivers/misc/mei/hdcp/Kconfig | 2 +- + drivers/misc/mei/hdcp/mei_hdcp.c | 14 ++++++++++++-- + drivers/misc/mei/pxp/Kconfig | 2 +- + drivers/misc/mei/pxp/mei_pxp.c | 14 ++++++++++++-- + 6 files changed, 35 insertions(+), 10 deletions(-) +Merging accel/habanalabs-next (dddb2e526a36 accel/habanalabs: use kcalloc() instead of kzalloc()) +$ git merge -m Merge branch 'habanalabs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git accel/habanalabs-next +Merge made by the 'ort' strategy. + .../accel/habanalabs/common/command_submission.c | 3 +- + drivers/accel/habanalabs/common/device.c | 49 ++- + drivers/accel/habanalabs/common/habanalabs.h | 33 +- + drivers/accel/habanalabs/common/hw_queue.c | 17 + + drivers/accel/habanalabs/common/mmu/Makefile | 2 +- + drivers/accel/habanalabs/common/mmu/mmu.c | 223 ++++++++++++- + drivers/accel/habanalabs/common/mmu/mmu_v1.c | 354 +++------------------ + drivers/accel/habanalabs/common/mmu/mmu_v2.c | 338 ++++++++++++++++++++ + drivers/accel/habanalabs/gaudi/gaudi.c | 1 + + drivers/accel/habanalabs/gaudi2/gaudi2.c | 258 +++++++++++---- + drivers/accel/habanalabs/gaudi2/gaudi2P.h | 12 +- + drivers/accel/habanalabs/goya/goya_coresight.c | 3 +- + .../habanalabs/include/hw_ip/mmu/mmu_general.h | 2 + + 13 files changed, 899 insertions(+), 396 deletions(-) + create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c +Merging coresight/next (60e5f23dc5d6 coresight: ultrasoc-smb: Use guards to cleanup) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git coresight/next +Already up to date. +Merging fastrpc/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git fastrpc/for-next +Already up to date. +Merging fpga/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga/for-next +Already up to date. +Merging icc/icc-next (7158ba962f41 Merge branch 'icc-fixes' into icc-next) +$ git merge -m Merge branch 'icc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git icc/icc-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/interconnect/qcom,rpm.yaml | 3 + + drivers/interconnect/qcom/Kconfig | 9 + + drivers/interconnect/qcom/Makefile | 2 + + drivers/interconnect/qcom/msm8909.c | 1329 ++++++++++++++++++++ + drivers/interconnect/qcom/sc8180x.c | 1 + + drivers/interconnect/qcom/sm8550.c | 575 +-------- + drivers/interconnect/qcom/sm8550.h | 284 ++--- + drivers/interconnect/qcom/x1e80100.c | 315 ----- + include/dt-bindings/interconnect/qcom,msm8909.h | 93 ++ + .../dt-bindings/interconnect/qcom,x1e80100-rpmh.h | 24 - + 10 files changed, 1560 insertions(+), 1075 deletions(-) + create mode 100644 drivers/interconnect/qcom/msm8909.c + create mode 100644 include/dt-bindings/interconnect/qcom,msm8909.h +Merging iio/togreg (a0295c1bd4a7 iio: frequency: admfm2000: New driver) +$ git merge -m Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio/togreg +Auto-merging MAINTAINERS +Auto-merging drivers/iio/industrialio-core.c +Merge made by the 'ort' strategy. + .../devicetree/bindings/iio/adc/adi,ad7380.yaml | 86 ++++ + .../bindings/iio/adc/richtek,rtq6056.yaml | 9 +- + .../bindings/iio/frequency/adi,admfm2000.yaml | 127 +++++ + .../devicetree/bindings/iio/light/ams,as73211.yaml | 7 +- + .../iio/pressure/honeywell,mprls0025pa.yaml | 98 +++- + MAINTAINERS | 24 +- + drivers/iio/accel/Kconfig | 8 +- + drivers/iio/accel/Makefile | 1 + + drivers/iio/accel/bmc150-accel-i2c.c | 2 +- + drivers/iio/accel/bmc150-accel-spi.c | 3 +- + drivers/iio/accel/bmi088-accel-i2c.c | 70 +++ + drivers/iio/accel/da280.c | 66 ++- + drivers/iio/accel/kxcjk-1013.c | 33 +- + drivers/iio/accel/mma9551.c | 4 +- + drivers/iio/accel/mma9553.c | 4 +- + drivers/iio/accel/mxc4005.c | 4 +- + drivers/iio/accel/mxc6255.c | 4 +- + drivers/iio/accel/st_accel_i2c.c | 5 +- + drivers/iio/accel/stk8ba50.c | 4 +- + drivers/iio/adc/Kconfig | 16 + + drivers/iio/adc/Makefile | 1 + + drivers/iio/adc/ad7380.c | 462 +++++++++++++++++++ + drivers/iio/adc/ad_sigma_delta.c | 7 +- + drivers/iio/adc/rtq6056.c | 275 ++++++++++- + drivers/iio/adc/ti-adc108s102.c | 4 +- + drivers/iio/adc/ti-ads1015.c | 2 +- + drivers/iio/buffer/industrialio-buffer-dmaengine.c | 3 +- + .../iio/common/inv_sensors/inv_sensors_timestamp.c | 2 +- + drivers/iio/dummy/iio_dummy_evgen.c | 2 - + drivers/iio/frequency/Kconfig | 10 + + drivers/iio/frequency/Makefile | 1 + + drivers/iio/frequency/admfm2000.c | 282 +++++++++++ + drivers/iio/gyro/bmg160_i2c.c | 4 +- + drivers/iio/health/afe4403.c | 65 +-- + drivers/iio/health/afe4404.c | 65 +-- + drivers/iio/humidity/hts221_i2c.c | 4 +- + drivers/iio/imu/adis16475.c | 8 +- + drivers/iio/imu/adis16480.c | 9 +- + drivers/iio/imu/fxos8700_i2c.c | 3 +- + drivers/iio/imu/fxos8700_spi.c | 3 +- + drivers/iio/imu/kmx61.c | 2 +- + drivers/iio/industrialio-core.c | 4 +- + drivers/iio/light/Kconfig | 5 +- + drivers/iio/light/as73211.c | 142 ++++-- + drivers/iio/light/jsa1212.c | 4 +- + drivers/iio/light/ltr501.c | 3 +- + drivers/iio/light/max44000.c | 6 +- + drivers/iio/light/rpr0521.c | 4 +- + drivers/iio/light/stk3310.c | 4 +- + drivers/iio/light/us5182d.c | 4 +- + drivers/iio/light/vcnl4000.c | 36 +- + drivers/iio/magnetometer/bmc150_magn_i2c.c | 3 +- + drivers/iio/magnetometer/bmc150_magn_spi.c | 3 +- + drivers/iio/magnetometer/mmc35240.c | 4 +- + drivers/iio/potentiometer/max5487.c | 4 +- + drivers/iio/pressure/Kconfig | 14 +- + drivers/iio/pressure/Makefile | 2 + + drivers/iio/pressure/hp206c.c | 6 +- + drivers/iio/pressure/mprls0025pa.c | 313 ++++++------- + drivers/iio/pressure/mprls0025pa.h | 102 ++++ + drivers/iio/pressure/mprls0025pa_i2c.c | 100 ++++ + drivers/iio/pressure/mprls0025pa_spi.c | 92 ++++ + drivers/iio/pressure/st_pressure_i2c.c | 5 +- + drivers/iio/test/Kconfig | 14 + + drivers/iio/test/Makefile | 1 + + drivers/iio/test/iio-test-gts.c | 513 +++++++++++++++++++++ + tools/iio/iio_utils.c | 2 +- + 67 files changed, 2724 insertions(+), 455 deletions(-) + create mode 100644 Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml + create mode 100644 Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml + create mode 100644 drivers/iio/accel/bmi088-accel-i2c.c + create mode 100644 drivers/iio/adc/ad7380.c + create mode 100644 drivers/iio/frequency/admfm2000.c + create mode 100644 drivers/iio/pressure/mprls0025pa.h + create mode 100644 drivers/iio/pressure/mprls0025pa_i2c.c + create mode 100644 drivers/iio/pressure/mprls0025pa_spi.c + create mode 100644 drivers/iio/test/iio-test-gts.c +Merging phy-next/next (25ee21fc97db phy: qcom: sgmii-eth: move PCS registers to separate header) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy-next/next +Auto-merging drivers/phy/qualcomm/phy-qcom-qmp-usb.c +CONFLICT (content): Merge conflict in drivers/phy/qualcomm/phy-qcom-qmp-usb.c +Resolved 'drivers/phy/qualcomm/phy-qcom-qmp-usb.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master b4386f156a7c] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git +$ git diff -M --stat --summary HEAD^.. + .../bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml | 184 ++++ + .../bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml | 6 + + .../bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml | 2 + + .../phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml | 22 - + drivers/phy/marvell/phy-armada38x-comphy.c | 7 +- + drivers/phy/qualcomm/Makefile | 2 +- + drivers/phy/qualcomm/phy-qcom-edp.c | 3 +- + drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 109 +- + drivers/phy/qualcomm/phy-qcom-qmp-common.h | 59 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h | 18 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h | 21 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h | 19 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h | 13 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h | 13 + + drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h | 62 ++ + drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c | 70 +- + drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 288 +++-- + drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6_20.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h | 20 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6_20.h | 1 + + drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h | 2 + + .../qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h | 8 + + .../phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h | 2 + + drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 242 +++-- + drivers/phy/qualcomm/phy-qcom-qmp-usb-legacy.c | 76 +- + drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 422 +------ + drivers/phy/qualcomm/phy-qcom-qmp-usbc.c | 1149 ++++++++++++++++++++ + drivers/phy/qualcomm/phy-qcom-qmp.h | 101 +- + drivers/phy/qualcomm/phy-qcom-sgmii-eth.c | 417 +++---- + 31 files changed, 2198 insertions(+), 1146 deletions(-) + create mode 100644 Documentation/devicetree/bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-common.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h + create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-usbc.c +Merging soundwire/next (0707496ff4e4 soundwire: stream: add missing const to Documentation) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire/next +Merge made by the 'ort' strategy. + Documentation/driver-api/soundwire/stream.rst | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) +Merging extcon/extcon-next (7803680964c0 extcon: qcom-spmi-misc: don't use kernel-doc marker for comment) +$ git merge -m Merge branch 'extcon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git extcon/extcon-next +Already up to date. +Merging gnss/gnss-next (41bccc98fb79 Linux 6.8-rc2) +$ git merge -m Merge branch 'gnss-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss/gnss-next +Already up to date. +Merging vfio/next (78f70c02bdbc vfio/virtio: fix virtio-pci dependency) +$ git merge -m Merge branch 'next' of git://github.com/awilliam/linux-vfio.git vfio/next +Already up to date. +Merging w1/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1/for-next +Already up to date. +Merging spmi/spmi-next (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'spmi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git spmi/spmi-next +Already up to date. +Merging staging/staging-next (ce54e9342124 staging: Remove board staging code) +$ git merge -m Merge branch 'staging-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging/staging-next +Merge made by the 'ort' strategy. + drivers/staging/Kconfig | 4 - + drivers/staging/Makefile | 2 - + drivers/staging/board/Kconfig | 12 - + drivers/staging/board/Makefile | 4 - + drivers/staging/board/TODO | 2 - + drivers/staging/board/armadillo800eva.c | 88 - + drivers/staging/board/board.c | 204 -- + drivers/staging/board/board.h | 46 - + drivers/staging/board/kzm9d.c | 26 - + drivers/staging/emxx_udc/Kconfig | 11 - + drivers/staging/emxx_udc/Makefile | 2 - + drivers/staging/emxx_udc/TODO | 6 - + drivers/staging/emxx_udc/emxx_udc.c | 3223 --------------------- + drivers/staging/emxx_udc/emxx_udc.h | 554 ---- + drivers/staging/fieldbus/anybuss/arcx-anybus.c | 6 +- + drivers/staging/fieldbus/dev_core.c | 6 +- + drivers/staging/greybus/audio_manager.c | 8 +- + drivers/staging/greybus/authentication.c | 6 +- + drivers/staging/greybus/fw-download.c | 7 +- + drivers/staging/greybus/fw-management.c | 20 +- + drivers/staging/greybus/gbphy.c | 8 +- + drivers/staging/greybus/loopback.c | 6 +- + drivers/staging/greybus/raw.c | 6 +- + drivers/staging/greybus/vibrator.c | 6 +- + drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c | 8 +- + drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 33 +- + drivers/staging/rtl8192e/rtl8192e/rtl_wx.c | 2 +- + drivers/staging/rtl8192e/rtl819x_TSProc.c | 6 +- + drivers/staging/rtl8192e/rtllib.h | 26 +- + drivers/staging/rtl8192e/rtllib_rx.c | 6 +- + drivers/staging/rtl8192e/rtllib_softmac.c | 86 +- + drivers/staging/rtl8192e/rtllib_tx.c | 10 +- + drivers/staging/rtl8192e/rtllib_wx.c | 2 +- + drivers/staging/rtl8723bs/core/rtw_ieee80211.c | 4 +- + drivers/staging/rtl8723bs/core/rtw_sta_mgt.c | 3 +- + drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 3 +- + drivers/staging/vt6655/card.c | 74 +- + drivers/staging/vt6655/rxtx.h | 1 - + 38 files changed, 168 insertions(+), 4359 deletions(-) + delete mode 100644 drivers/staging/board/Kconfig + delete mode 100644 drivers/staging/board/Makefile + delete mode 100644 drivers/staging/board/TODO + delete mode 100644 drivers/staging/board/armadillo800eva.c + delete mode 100644 drivers/staging/board/board.c + delete mode 100644 drivers/staging/board/board.h + delete mode 100644 drivers/staging/board/kzm9d.c + delete mode 100644 drivers/staging/emxx_udc/Kconfig + delete mode 100644 drivers/staging/emxx_udc/Makefile + delete mode 100644 drivers/staging/emxx_udc/TODO + delete mode 100644 drivers/staging/emxx_udc/emxx_udc.c + delete mode 100644 drivers/staging/emxx_udc/emxx_udc.h +Merging counter-next/counter-next (0b3bbd8f9baf counter: linux/counter.h: fix Excess kernel-doc description warning) +$ git merge -m Merge branch 'counter-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-next/counter-next +Merge made by the 'ort' strategy. + include/linux/counter.h | 1 - + 1 file changed, 1 deletion(-) +Merging mux/for-next (44c026a73be8 Linux 6.4-rc3) +$ git merge -m Merge branch 'for-next' of https://gitlab.com/peda-linux/mux.git mux/for-next +Already up to date. +Merging dmaengine/next (93bdff7bb83a dmaengine: ti: k3-psil-j721s2: Add entry for CSI2RX) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine/next +Merge made by the 'ort' strategy. + .../bindings/dma/allwinner,sun50i-a64-dma.yaml | 12 +- + drivers/dma/Kconfig | 14 +- + drivers/dma/bestcomm/sram.c | 5 - + drivers/dma/pl330.c | 1 + + drivers/dma/ti/k3-psil-j721s2.c | 73 +++++ + drivers/dma/ti/k3-udma-glue.c | 314 +++++++++++++++------ + drivers/dma/xilinx/xilinx_dma.c | 6 + + include/linux/dma/k3-udma-glue.h | 10 + + 8 files changed, 328 insertions(+), 107 deletions(-) +Merging cgroup/for-next (8d4c171f451d docs: cgroup-v1: add missing code-block tags) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git cgroup/for-next +Merge made by the 'ort' strategy. + Documentation/admin-guide/cgroup-v1/hugetlb.rst | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) +Merging scsi/for-next (890d900e7fec Merge branch 'misc' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi/for-next +Merge made by the 'ort' strategy. + drivers/message/fusion/mptfc.c | 4 +- + drivers/scsi/fnic/fnic_scsi.c | 4 +- + drivers/scsi/hisi_sas/hisi_sas_main.c | 26 +++++-- + drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 8 ++- + drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 18 ++--- + drivers/scsi/megaraid.c | 2 +- + drivers/scsi/mpi3mr/mpi3mr_os.c | 12 +++- + drivers/scsi/mpt3sas/mpt3sas_base.c | 113 ++++++++++++++++++++----------- + drivers/scsi/mpt3sas/mpt3sas_base.h | 8 ++- + drivers/scsi/mpt3sas/mpt3sas_ctl.c | 54 +++++++++++++++ + drivers/scsi/mpt3sas/mpt3sas_ctl.h | 10 +++ + drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 + + drivers/scsi/scsi_devinfo.c | 6 +- + drivers/scsi/sd.c | 2 +- + drivers/ufs/core/ufs-mcq.c | 12 ++-- + drivers/ufs/core/ufs-sysfs.c | 49 ++++++++++++++ + drivers/ufs/core/ufshcd.c | 68 ++++++++++++++++--- + drivers/ufs/host/ufs-mediatek.c | 90 ++++++++++++++++-------- + drivers/ufs/host/ufs-mediatek.h | 7 +- + drivers/ufs/host/ufs-qcom.c | 28 ++++++-- + include/scsi/scsi_host.h | 6 +- + include/ufs/ufshcd.h | 7 ++ + include/ufs/ufshci.h | 3 + + 23 files changed, 412 insertions(+), 126 deletions(-) +Merging scsi-mkp/for-next (3f90ac7138ed Merge patch series "scsi: Allow scsi_execute users to request retries") +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-mkp/for-next +Auto-merging drivers/scsi/scsi_lib.c +Merge made by the 'ort' strategy. + drivers/scsi/3w-9xxx.c | 44 ++-- + drivers/scsi/3w-sas.c | 36 +-- + drivers/scsi/3w-xxxx.c | 44 ++-- + drivers/scsi/53c700.c | 2 +- + drivers/scsi/Kconfig | 9 + + drivers/scsi/aacraid/aachba.c | 6 +- + drivers/scsi/ch.c | 27 ++- + drivers/scsi/device_handler/scsi_dh_hp_sw.c | 49 +++-- + drivers/scsi/device_handler/scsi_dh_rdac.c | 84 +++---- + drivers/scsi/fnic/fnic_attrs.c | 7 +- + drivers/scsi/ibmvscsi/ibmvfc.c | 22 +- + drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 6 +- + drivers/scsi/isci/init.c | 2 +- + drivers/scsi/pm8001/pm8001_ctl.c | 6 +- + drivers/scsi/scsi_lib.c | 124 ++++++++++- + drivers/scsi/scsi_lib_test.c | 330 ++++++++++++++++++++++++++++ + drivers/scsi/scsi_scan.c | 105 +++++---- + drivers/scsi/scsi_transport_spi.c | 35 +-- + drivers/scsi/sd.c | 218 +++++++++++------- + drivers/scsi/ses.c | 66 ++++-- + drivers/scsi/sr.c | 38 ++-- + drivers/ufs/core/ufshcd.c | 22 +- + include/scsi/scsi_device.h | 48 ++++ + 23 files changed, 979 insertions(+), 351 deletions(-) + create mode 100644 drivers/scsi/scsi_lib_test.c +Merging vhost/linux-next (f16d65124380 vdpa/mlx5: Add mkey leak detection) +$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost/linux-next +Already up to date. +Merging rpmsg/for-next (99f59b148871 Merge branches 'rpmsg-next' and 'rproc-next' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git rpmsg/for-next +Merge made by the 'ort' strategy. + drivers/remoteproc/remoteproc_virtio.c | 6 +++--- + drivers/remoteproc/stm32_rproc.c | 6 +++--- + drivers/rpmsg/rpmsg_char.c | 12 ++++++------ + drivers/rpmsg/rpmsg_ctrl.c | 12 ++++++------ + 4 files changed, 18 insertions(+), 18 deletions(-) +Merging gpio/for-next (0bb80ecc33a8 Linux 6.6-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git gpio/for-next +Already up to date. +Merging gpio-brgl/gpio/for-next (6933ba529d06 gpio: improve the API contract for setting direction) +$ git merge -m Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl/gpio/for-next +Auto-merging Documentation/userspace-api/index.rst +CONFLICT (content): Merge conflict in Documentation/userspace-api/index.rst +Auto-merging MAINTAINERS +Auto-merging drivers/gpio/gpio-eic-sprd.c +Resolved 'Documentation/userspace-api/index.rst' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master 8b887b8afdb2] Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git +$ git diff -M --stat --summary HEAD^.. + Documentation/ABI/obsolete/sysfs-gpio | 4 +- + Documentation/ABI/testing/gpio-cdev | 9 +- + Documentation/admin-guide/gpio/gpio-mockup.rst | 8 ++ + Documentation/admin-guide/gpio/index.rst | 6 +- + Documentation/admin-guide/gpio/obsolete.rst | 13 ++ + Documentation/userspace-api/gpio/chardev.rst | 116 ++++++++++++++++ + Documentation/userspace-api/gpio/chardev_v1.rst | 131 ++++++++++++++++++ + Documentation/userspace-api/gpio/error-codes.rst | 79 +++++++++++ + .../userspace-api/gpio/gpio-get-chipinfo-ioctl.rst | 41 ++++++ + .../gpio/gpio-get-lineevent-ioctl.rst | 84 ++++++++++++ + .../gpio/gpio-get-linehandle-ioctl.rst | 125 +++++++++++++++++ + .../userspace-api/gpio/gpio-get-lineinfo-ioctl.rst | 54 ++++++++ + .../gpio/gpio-get-lineinfo-unwatch-ioctl.rst | 49 +++++++ + .../gpio/gpio-get-lineinfo-watch-ioctl.rst | 74 ++++++++++ + .../gpio/gpio-handle-get-line-values-ioctl.rst | 56 ++++++++ + .../gpio/gpio-handle-set-config-ioctl.rst | 63 +++++++++ + .../gpio/gpio-handle-set-line-values-ioctl.rst | 48 +++++++ + .../gpio/gpio-lineevent-data-read.rst | 84 ++++++++++++ + .../gpio/gpio-lineinfo-changed-read.rst | 87 ++++++++++++ + .../userspace-api/gpio/gpio-v2-get-line-ioctl.rst | 152 +++++++++++++++++++++ + .../gpio/gpio-v2-get-lineinfo-ioctl.rst | 50 +++++++ + .../gpio/gpio-v2-get-lineinfo-watch-ioctl.rst | 67 +++++++++ + .../userspace-api/gpio/gpio-v2-line-event-read.rst | 83 +++++++++++ + .../gpio/gpio-v2-line-get-values-ioctl.rst | 51 +++++++ + .../gpio/gpio-v2-line-set-config-ioctl.rst | 58 ++++++++ + .../gpio/gpio-v2-line-set-values-ioctl.rst | 47 +++++++ + .../gpio/gpio-v2-lineinfo-changed-read.rst | 81 +++++++++++ + Documentation/userspace-api/gpio/index.rst | 18 +++ + Documentation/userspace-api/gpio/obsolete.rst | 11 ++ + .../{admin-guide => userspace-api}/gpio/sysfs.rst | 27 ++-- + Documentation/userspace-api/index.rst | 1 + + MAINTAINERS | 1 + + drivers/gpio/gpio-eic-sprd.c | 10 +- + drivers/gpio/gpiolib-cdev.c | 5 - + drivers/gpio/gpiolib-legacy.c | 12 ++ + drivers/gpio/gpiolib-of.c | 2 + + drivers/gpio/gpiolib.c | 93 ++++++------- + include/linux/gpio/driver.h | 22 +-- + include/uapi/linux/gpio.h | 52 +++---- + 39 files changed, 1850 insertions(+), 124 deletions(-) + create mode 100644 Documentation/admin-guide/gpio/obsolete.rst + create mode 100644 Documentation/userspace-api/gpio/chardev.rst + create mode 100644 Documentation/userspace-api/gpio/chardev_v1.rst + create mode 100644 Documentation/userspace-api/gpio/error-codes.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-chipinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineevent-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-linehandle-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-unwatch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-watch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-handle-get-line-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-handle-set-config-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-handle-set-line-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-lineevent-data-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-lineinfo-changed-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-line-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-watch-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-event-read.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-get-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-config-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-values-ioctl.rst + create mode 100644 Documentation/userspace-api/gpio/gpio-v2-lineinfo-changed-read.rst + create mode 100644 Documentation/userspace-api/gpio/index.rst + create mode 100644 Documentation/userspace-api/gpio/obsolete.rst + rename Documentation/{admin-guide => userspace-api}/gpio/sysfs.rst (89%) +Merging gpio-intel/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel/for-next +Already up to date. +Merging pinctrl/for-next (47eed1127d2a dt-bindings: pinctrl: amlogic: narrow regex for unit address to hex numbers) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git pinctrl/for-next +Merge made by the 'ort' strategy. + .../bindings/pinctrl/amlogic,meson-pinctrl-a1.yaml | 2 +- + .../pinctrl/amlogic,meson-pinctrl-g12a-aobus.yaml | 2 +- + .../amlogic,meson-pinctrl-g12a-periphs.yaml | 2 +- + .../pinctrl/amlogic,meson8-pinctrl-aobus.yaml | 2 +- + .../pinctrl/amlogic,meson8-pinctrl-cbus.yaml | 2 +- + drivers/pinctrl/mediatek/pinctrl-mt7981.c | 24 ++++++++++++++++++++-- + drivers/pinctrl/mediatek/pinctrl-mt7986.c | 2 +- + drivers/pinctrl/pinctrl-st.c | 3 +-- + drivers/pinctrl/pinctrl-zynqmp.c | 8 ++++---- + 9 files changed, 33 insertions(+), 14 deletions(-) +Merging pinctrl-intel/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel/for-next +Already up to date. +Merging pinctrl-renesas/renesas-pinctrl (fea58424e252 pinctrl: renesas: pinctrl-rzg2l: Add the missing port pins P19 to P28) +$ git merge -m Merge branch 'renesas-pinctrl' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git pinctrl-renesas/renesas-pinctrl +Merge made by the 'ort' strategy. + arch/riscv/boot/dts/renesas/r9a07g043f.dtsi | 4 + + drivers/pinctrl/renesas/core.c | 4 +- + drivers/pinctrl/renesas/pfc-r8a779g0.c | 14 ++ + drivers/pinctrl/renesas/pinctrl-rzg2l.c | 309 +++++++++++++++++++++++----- + 4 files changed, 277 insertions(+), 54 deletions(-) +Merging pinctrl-samsung/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung/for-next +Already up to date. +Merging pwm/pwm/for-next (979c6fe7e799 dt-bindings: pxa-pwm: Convert to YAML) +$ git merge -m Merge branch 'pwm/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git pwm/pwm/for-next +Merge made by the 'ort' strategy. + .../devicetree/bindings/pwm/marvell,pxa-pwm.yaml | 51 ++++++++++++++++++++++ + Documentation/devicetree/bindings/pwm/pxa-pwm.txt | 30 ------------- + drivers/gpu/drm/bridge/ti-sn65dsi86.c | 1 - + drivers/pwm/core.c | 45 +++++-------------- + drivers/pwm/pwm-atmel-hlcdc.c | 2 +- + drivers/pwm/pwm-clps711x.c | 11 ----- + drivers/pwm/pwm-cros-ec.c | 1 - + drivers/pwm/pwm-mediatek.c | 1 - + drivers/pwm/pwm-pxa.c | 4 +- + include/linux/pwm.h | 2 - + 10 files changed, 65 insertions(+), 83 deletions(-) + create mode 100644 Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml + delete mode 100644 Documentation/devicetree/bindings/pwm/pxa-pwm.txt +Merging ktest/for-next (7dc8e24f0e09 ktest: Restore stty setting at first in dodie) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git ktest/for-next +Already up to date. +Merging kselftest/next (6a71770442b5 selftests: livepatch: Test livepatching a heavily called syscall) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest/next +Auto-merging Documentation/dev-tools/kselftest.rst +Auto-merging MAINTAINERS +Auto-merging lib/Kconfig.debug +Auto-merging tools/testing/selftests/lib.mk +Auto-merging tools/testing/selftests/livepatch/functions.sh +Merge made by the 'ort' strategy. + Documentation/dev-tools/kselftest.rst | 4 + + MAINTAINERS | 1 - + arch/s390/configs/debug_defconfig | 1 - + arch/s390/configs/defconfig | 1 - + lib/Kconfig.debug | 22 ---- + lib/Makefile | 2 - + lib/livepatch/Makefile | 14 --- + tools/testing/selftests/lib.mk | 26 ++++- + tools/testing/selftests/livepatch/Makefile | 5 +- + tools/testing/selftests/livepatch/README | 25 +++-- + tools/testing/selftests/livepatch/config | 1 - + tools/testing/selftests/livepatch/functions.sh | 34 +++--- + .../testing/selftests/livepatch/test-callbacks.sh | 50 ++++----- + tools/testing/selftests/livepatch/test-ftrace.sh | 6 +- + .../testing/selftests/livepatch/test-livepatch.sh | 10 +- + .../selftests/livepatch/test-shadow-vars.sh | 2 +- + tools/testing/selftests/livepatch/test-state.sh | 18 ++-- + tools/testing/selftests/livepatch/test-syscall.sh | 53 ++++++++++ + tools/testing/selftests/livepatch/test-sysfs.sh | 6 +- + .../selftests/livepatch/test_klp-call_getpid.c | 44 ++++++++ + .../selftests/livepatch/test_modules/Makefile | 20 ++++ + .../test_modules}/test_klp_atomic_replace.c | 0 + .../test_modules}/test_klp_callbacks_busy.c | 0 + .../test_modules}/test_klp_callbacks_demo.c | 0 + .../test_modules}/test_klp_callbacks_demo2.c | 0 + .../test_modules}/test_klp_callbacks_mod.c | 0 + .../livepatch/test_modules}/test_klp_livepatch.c | 0 + .../livepatch/test_modules}/test_klp_shadow_vars.c | 0 + .../livepatch/test_modules}/test_klp_state.c | 0 + .../livepatch/test_modules}/test_klp_state2.c | 0 + .../livepatch/test_modules}/test_klp_state3.c | 0 + .../livepatch/test_modules/test_klp_syscall.c | 116 +++++++++++++++++++++ + 32 files changed, 340 insertions(+), 121 deletions(-) + delete mode 100644 lib/livepatch/Makefile + create mode 100755 tools/testing/selftests/livepatch/test-syscall.sh + create mode 100644 tools/testing/selftests/livepatch/test_klp-call_getpid.c + create mode 100644 tools/testing/selftests/livepatch/test_modules/Makefile + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_atomic_replace.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_busy.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo2.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_mod.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_livepatch.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_shadow_vars.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state2.c (100%) + rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state3.c (100%) + create mode 100644 tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c +Merging kunit/test (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'test' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit/test +Already up to date. +Merging kunit-next/kunit (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'kunit' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-next/kunit +Already up to date. +Merging livepatching/for-next (602bf1830798 Merge branch 'for-6.7' into for-next) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching livepatching/for-next +Merge made by the 'ort' strategy. +Merging rtc/rtc-next (14688f1a91e1 rtc: nuvoton: Compatible with NCT3015Y-R and NCT3018Y-R) +$ git merge -m Merge branch 'rtc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc/rtc-next +Already up to date. +Merging nvdimm/libnvdimm-for-next (a085a5eb6594 acpi/nfit: Use sysfs_emit() for all attributes) +$ git merge -m Merge branch 'libnvdimm-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm/libnvdimm-for-next +Already up to date. +Merging at24/at24/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'at24/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git at24/at24/for-next +Already up to date. +Merging ntb/ntb-next (9341b37ec17a ntb_perf: Fix printk format) +$ git merge -m Merge branch 'ntb-next' of https://github.com/jonmason/ntb.git ntb/ntb-next +Merge made by the 'ort' strategy. + drivers/ntb/hw/intel/ntb_hw_gen1.c | 2 +- + drivers/ntb/test/ntb_perf.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) +Merging seccomp/for-next/seccomp (0c6f28a84431 selftests/seccomp: user_notification_addfd check nextfd is available) +$ git merge -m Merge branch 'for-next/seccomp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git seccomp/for-next/seccomp +Merge made by the 'ort' strategy. + tools/testing/selftests/seccomp/seccomp_bpf.c | 41 ++++++++++++++++++++------- + 1 file changed, 31 insertions(+), 10 deletions(-) +Merging fsi/next (c5eeb63edac9 fsi: Fix panic on scom file read) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git fsi/next +Merge made by the 'ort' strategy. + drivers/fsi/fsi-sbefifo.c | 9 ++++++++- + drivers/fsi/i2cr-scom.c | 11 ++++++++++- + 2 files changed, 18 insertions(+), 2 deletions(-) +Merging slimbus/for-next (04b945e4cf81 slimbus: qcom-ngd-ctrl: Make QMI message rules const) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git slimbus/for-next +Merge made by the 'ort' strategy. + drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) +Merging nvmem/for-next (a0cfd5e99782 dt-bindings: nvmem: Convert xlnx,zynqmp-nvmem.txt to yaml) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git nvmem/for-next +Merge made by the 'ort' strategy. + Documentation/ABI/testing/sysfs-nvmem-cells | 16 ++++---- + .../bindings/nvmem/layouts/fixed-cell.yaml | 22 +++++------ + .../bindings/nvmem/xlnx,zynqmp-nvmem.txt | 46 ---------------------- + .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml | 42 ++++++++++++++++++++ + drivers/nvmem/core.c | 5 ++- + drivers/nvmem/mtk-efuse.c | 21 +++++++++- + 6 files changed, 83 insertions(+), 69 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt + create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml +Merging xarray/main (2a15de80dd0f idr: fix param name in idr_alloc_cyclic() doc) +$ git merge -m Merge branch 'main' of git://git.infradead.org/users/willy/xarray.git xarray/main +Already up to date. +Merging hyperv/hyperv-next (ce9ecca0238b Linux 6.6-rc2) +$ git merge -m Merge branch 'hyperv-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv/hyperv-next +Already up to date. +Merging auxdisplay/auxdisplay (c52391fafcef auxdisplay: img-ascii-lcd: Use device_get_match_data()) +$ git merge -m Merge branch 'auxdisplay' of https://github.com/ojeda/linux.git auxdisplay/auxdisplay +Already up to date. +Merging kgdb/kgdb/for-next (4f41d30cd6dc kdb: Fix a potential buffer overflow in kdb_local()) +$ git merge -m Merge branch 'kgdb/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git kgdb/kgdb/for-next +Already up to date. +Merging hmm/hmm (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git hmm/hmm +Already up to date. +Merging cfi/cfi/next (06c2afb862f9 Linux 6.5-rc1) +$ git merge -m Merge branch 'cfi/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git cfi/cfi/next +Already up to date. +Merging mhi/mhi-next (8ddf54a32111 bus: mhi: host: Read PK HASH dynamically) +$ git merge -m Merge branch 'mhi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git mhi/mhi-next +Merge made by the 'ort' strategy. + drivers/bus/mhi/host/boot.c | 11 +---------- + drivers/bus/mhi/host/init.c | 17 +++++++++++++---- + drivers/bus/mhi/host/internal.h | 9 ++++++--- + drivers/bus/mhi/host/pm.c | 20 +++++++++++++++++--- + include/linux/mhi.h | 2 -- + 5 files changed, 37 insertions(+), 22 deletions(-) +Merging memblock/for-next (2159bd4e9057 memblock: Return NUMA_NO_NODE instead of -1 to improve code readability) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock/for-next +Already up to date. +Merging cxl/next (73bf93edeeea cxl/core: use sysfs_emit() for attr's _show()) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl/next +Already up to date. +Merging zstd/zstd-next (3f832dfb8a8e zstd: fix g_debuglevel export warning) +$ git merge -m Merge branch 'zstd-next' of https://github.com/terrelln/linux.git zstd/zstd-next +Merge made by the 'ort' strategy. + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 697 ++++++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 5 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 234 +--- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 99 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 +++-- + lib/zstd/compress/zstd_compress.c | 1758 +++++++++++++++++------- + lib/zstd/compress/zstd_compress_internal.h | 333 +++-- + lib/zstd/compress/zstd_compress_literals.c | 155 ++- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 578 ++++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 518 +++---- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 187 +-- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 772 +++++++---- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 259 +++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++-- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- + lib/zstd/decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 4791 insertions(+), 2596 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h +Merging efi/next (4afa688d7141 efi: memmap: fix kernel-doc warnings) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi/next +Already up to date. +Merging unicode/for-next (367122c529f3 libfs: Attempt exact-match comparison first during casefolded lookup) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git unicode/for-next +Auto-merging include/linux/fs.h +Merge made by the 'ort' strategy. + fs/libfs.c | 40 +++++++++++++++++++++++----------------- + fs/overlayfs/params.c | 13 ++++++++++--- + include/linux/fs.h | 9 +++++++++ + 3 files changed, 42 insertions(+), 20 deletions(-) +Merging slab/slab/for-next (7d2ec24bd8a5 Merge branch 'slab/for-6.9/optimize-get-freelist' into slab/for-next) +$ git merge -m Merge branch 'slab/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git slab/slab/for-next +Auto-merging Documentation/admin-guide/kernel-parameters.txt +Auto-merging mm/slab_common.c +Merge made by the 'ort' strategy. + Documentation/admin-guide/kernel-parameters.txt | 79 +++++++++++-------------- + Documentation/mm/slub.rst | 60 +++++++++---------- + drivers/misc/lkdtm/heap.c | 2 +- + mm/Kconfig.debug | 6 +- + mm/slab.h | 6 +- + mm/slab_common.c | 17 +++--- + mm/slub.c | 75 +++++++++++------------ + 7 files changed, 117 insertions(+), 128 deletions(-) +Merging random/master (615d30064886 Merge tag 'trace-v6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace) +$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git random/master +Already up to date. +Merging landlock/next (2f8bb71d737c landlock: Document IOCTL support) +$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git landlock/next +Merge made by the 'ort' strategy. + Documentation/userspace-api/landlock.rst | 119 +++++- + include/uapi/linux/landlock.h | 58 ++- + samples/landlock/sandboxer.c | 13 +- + security/landlock/.kunitconfig | 4 + + security/landlock/Kconfig | 15 + + security/landlock/common.h | 2 + + security/landlock/fs.c | 410 ++++++++++++++++++++- + security/landlock/fs.h | 2 + + security/landlock/limits.h | 11 +- + security/landlock/ruleset.h | 2 +- + security/landlock/syscalls.c | 19 +- + tools/testing/kunit/configs/all_tests.config | 1 + + tools/testing/selftests/landlock/base_test.c | 2 +- + tools/testing/selftests/landlock/common.h | 88 +++-- + tools/testing/selftests/landlock/fs_test.c | 520 ++++++++++++++++++++++++++- + tools/testing/selftests/landlock/net_test.c | 13 +- + 16 files changed, 1187 insertions(+), 92 deletions(-) + create mode 100644 security/landlock/.kunitconfig +Merging rust/rust-next (f090f0d0eea9 rust: sync: update integer types in CondVar) +$ git merge -m Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git rust/rust-next +Auto-merging Documentation/process/changes.rst +CONFLICT (content): Merge conflict in Documentation/process/changes.rst +Auto-merging scripts/min-tool-version.sh +Resolved 'Documentation/process/changes.rst' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master a5b16680f994] Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git +$ git diff -M --stat --summary HEAD^.. + Documentation/process/changes.rst | 2 +- + rust/alloc/alloc.rs | 9 +++- + rust/alloc/boxed.rs | 20 +++++--- + rust/alloc/lib.rs | 7 +-- + rust/alloc/raw_vec.rs | 19 +++++-- + rust/alloc/vec/mod.rs | 16 +++--- + rust/bindings/bindings_helper.h | 1 + + rust/kernel/lib.rs | 2 +- + rust/kernel/sync.rs | 2 +- + rust/kernel/sync/condvar.rs | 104 ++++++++++++++++++++++++++++++-------- + rust/kernel/sync/lock.rs | 4 +- + rust/kernel/task.rs | 18 ++++++- + rust/kernel/time.rs | 20 ++++++++ + scripts/min-tool-version.sh | 2 +- + 14 files changed, 177 insertions(+), 49 deletions(-) + create mode 100644 rust/kernel/time.rs +Merging sysctl/sysctl-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'sysctl-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git sysctl/sysctl-next +Already up to date. +Merging execve/for-next/execve (90383cc07895 exec: Distinguish in_execve from in_exec) +$ git merge -m Merge branch 'for-next/execve' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git execve/for-next/execve +Already up to date. +Merging bitmap/bitmap-for-next (071ad962baf5 bitmap: Step down as a reviewer) +$ git merge -m Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git bitmap/bitmap-for-next +Auto-merging MAINTAINERS +Auto-merging arch/x86/kvm/hyperv.c +Auto-merging drivers/block/null_blk/main.c +Auto-merging drivers/infiniband/ulp/rtrs/rtrs-clt.c +Auto-merging drivers/iommu/arm/arm-smmu/arm-smmu.h +Auto-merging drivers/net/ethernet/sfc/rx_common.c +Auto-merging drivers/net/wireless/realtek/rtw88/pci.c +Auto-merging drivers/net/wireless/realtek/rtw89/pci.c +Auto-merging drivers/pci/controller/pci-hyperv.c +Auto-merging drivers/perf/arm_pmuv3.c +Auto-merging drivers/scsi/mpi3mr/mpi3mr_os.c +Auto-merging drivers/scsi/scsi_lib.c +Auto-merging drivers/tty/nozomi.c +Auto-merging drivers/tty/serial/sc16is7xx.c +CONFLICT (content): Merge conflict in drivers/tty/serial/sc16is7xx.c +Auto-merging drivers/usb/class/cdc-acm.c +Auto-merging kernel/sched/sched.h +Auto-merging kernel/watch_queue.c +Auto-merging lib/sbitmap.c +Auto-merging sound/pci/hda/hda_codec.c +Resolved 'drivers/tty/serial/sc16is7xx.c' using previous resolution. +Automatic merge failed; fix conflicts and then commit the result. +$ git commit --no-edit -v -a +[master ac451fd4abca] Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git +$ git diff -M --stat --summary HEAD^.. + MAINTAINERS | 1 - + arch/m68k/include/asm/mmu_context.h | 11 +- + arch/microblaze/include/asm/mmu_context_mm.h | 11 +- + arch/mips/sgi-ip30/ip30-irq.c | 12 +- + arch/powerpc/mm/book3s32/mmu_context.c | 10 +- + arch/powerpc/platforms/pasemi/dma_lib.c | 41 +--- + arch/powerpc/platforms/powernv/pci-sriov.c | 12 +- + arch/sh/boards/mach-x3proto/ilsel.c | 4 +- + arch/sparc/kernel/pci_msi.c | 9 +- + arch/x86/kvm/hyperv.c | 36 ++-- + drivers/block/null_blk/main.c | 39 ++-- + drivers/dma/idxd/perfmon.c | 8 +- + drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +- + drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 +- + drivers/iommu/msm_iommu.c | 18 +- + drivers/isdn/mISDN/core.c | 9 +- + drivers/media/radio/radio-shark.c | 5 +- + drivers/media/radio/radio-shark2.c | 5 +- + drivers/media/usb/cx231xx/cx231xx-cards.c | 16 +- + drivers/media/usb/em28xx/em28xx-cards.c | 37 ++-- + drivers/net/ethernet/rocker/rocker_ofdpa.c | 11 +- + drivers/net/ethernet/sfc/rx_common.c | 4 +- + drivers/net/ethernet/sfc/siena/rx_common.c | 4 +- + drivers/net/ethernet/sfc/siena/siena_sriov.c | 14 +- + drivers/net/wireless/ath/ath10k/snoc.c | 9 +- + drivers/net/wireless/realtek/rtw88/pci.c | 5 +- + drivers/net/wireless/realtek/rtw89/pci.c | 5 +- + drivers/pci/controller/pci-hyperv.c | 7 +- + drivers/perf/alibaba_uncore_drw_pmu.c | 10 +- + drivers/perf/arm-cci.c | 24 +-- + drivers/perf/arm-ccn.c | 10 +- + drivers/perf/arm_dmc620_pmu.c | 9 +- + drivers/perf/arm_pmuv3.c | 8 +- + drivers/scsi/mpi3mr/mpi3mr_os.c | 21 +- + drivers/scsi/qedi/qedi_main.c | 9 +- + drivers/scsi/scsi_lib.c | 7 +- + drivers/tty/nozomi.c | 5 +- + drivers/usb/class/cdc-acm.c | 5 +- + include/linux/cpumask.h | 12 ++ + include/linux/find.h | 301 ++++++++++++++++++++++++++- + kernel/sched/sched.h | 14 +- + kernel/watch_queue.c | 6 +- + lib/find_bit.c | 85 ++++++++ + lib/sbitmap.c | 46 +--- + lib/test_bitmap.c | 61 ++++++ + net/bluetooth/cmtp/core.c | 10 +- + net/smc/smc_wr.c | 10 +- + sound/pci/hda/hda_codec.c | 7 +- + sound/usb/caiaq/audio.c | 13 +- + 49 files changed, 629 insertions(+), 412 deletions(-) +Merging hte/for-next (b85ea95d0864 Linux 6.7-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git hte/for-next +Already up to date. +Merging kspp/for-next/kspp (34b82a2fb747 lkdtm/bugs: In lkdtm_HUNG_TASK() use BUG(), not BUG_ON(1)) +$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git kspp/for-next/kspp +Merge made by the 'ort' strategy. + drivers/misc/lkdtm/bugs.c | 3 ++- + drivers/misc/lkdtm/core.c | 22 ++++++++++++++-------- + 2 files changed, 16 insertions(+), 9 deletions(-) +Merging kspp-gustavo/for-next/kspp (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git kspp-gustavo/for-next/kspp +Already up to date. +Merging nolibc/nolibc (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'nolibc' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git nolibc/nolibc +Already up to date. +Merging tsm/tsm-next (f4738f56d1dc virt: tdx-guest: Add Quote generation support using TSM_REPORTS) +$ git merge -m Merge branch 'tsm-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux tsm/tsm-next +Already up to date. +Merging iommufd/for-next (6613476e225e Linux 6.8-rc1) +$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd/for-next +Already up to date. +Merging header_cleanup/header_cleanup (5f4c01f1e3c7 spinlock: Fix failing build for PREEMPT_RT) +$ git merge -m Merge branch 'header_cleanup' of https://evilpiepirate.org/git/bcachefs.git header_cleanup/header_cleanup +Already up to date. +$ git am -3 ../patches/0001-Revert-kasan-revert-eviction-of-stack-traces-in-gene.patch +Applying: Revert "kasan: revert eviction of stack traces in generic mode" +$ git am -3 ../patches/0002-Revert-stackdepot-use-variable-size-records-for-non-.patch +Applying: Revert "stackdepot: use variable size records for non-evictable entries" diff --git a/localversion-next b/localversion-next new file mode 100644 index 00000000000000..6d8c883d7eb82d --- /dev/null +++ b/localversion-next @@ -0,0 +1 @@ +-next-20240201 From bac17ffeaa5629eb9d06933574b3b7fc97358572 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 2 Jan 2024 11:33:45 -0800 Subject: [PATCH 662/707] drm/msm/a7xx: Fix LLC typo We'd miss actually activating LLC. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index c0bc924cd3025d..4c80f336bbcd4a 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -2427,7 +2427,7 @@ static int a6xx_gmu_pm_resume(struct msm_gpu *gpu) msm_devfreq_resume(gpu); - adreno_is_a7xx(adreno_gpu) ? a7xx_llc_activate : a6xx_llc_activate(a6xx_gpu); + adreno_is_a7xx(adreno_gpu) ? a7xx_llc_activate(a6xx_gpu) : a6xx_llc_activate(a6xx_gpu); return ret; } From 0c4dfbb410cc840470b981cb666dad4b9c56eb9f Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 18 Aug 2022 13:33:07 +0100 Subject: [PATCH 663/707] ASoC: q6apm-dai: fix period size to be algined to 64 bytes DSP expects the buffers to be aligned to 64bytes, so fix the current sizes where there is a possiblity of getting an unaligned buffers. Signed-off-by: Srinivas Kandagatla (cherry picked from commit 910a758d0340cff90ddb997a94ea269e30180beb) Signed-off-by: Krzysztof Kozlowski --- sound/soc/qcom/qdsp6/q6apm-dai.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index 052e40cb38feca..9c6f33cd508a16 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -19,15 +19,17 @@ #define DRV_NAME "q6apm-dai" #define PLAYBACK_MIN_NUM_PERIODS 2 -#define PLAYBACK_MAX_NUM_PERIODS 8 -#define PLAYBACK_MAX_PERIOD_SIZE 65536 -#define PLAYBACK_MIN_PERIOD_SIZE 128 +#define PLAYBACK_MAX_NUM_PERIODS 4 +#define PLAYBACK_MAX_PERIOD_SIZE 8192 +#define PLAYBACK_MAX_BUF_SIZE 65536 +#define PLAYBACK_MIN_PERIOD_SIZE 8192 #define CAPTURE_MIN_NUM_PERIODS 2 -#define CAPTURE_MAX_NUM_PERIODS 8 -#define CAPTURE_MAX_PERIOD_SIZE 4096 -#define CAPTURE_MIN_PERIOD_SIZE 320 -#define BUFFER_BYTES_MAX (PLAYBACK_MAX_NUM_PERIODS * PLAYBACK_MAX_PERIOD_SIZE) -#define BUFFER_BYTES_MIN (PLAYBACK_MIN_NUM_PERIODS * PLAYBACK_MIN_PERIOD_SIZE) +#define CAPTURE_MAX_NUM_PERIODS 4 +#define CAPTURE_MAX_PERIOD_SIZE 8192 +#define CAPTURE_MAX_BUF_SIZE 65536 +#define CAPTURE_MIN_PERIOD_SIZE 8192 +#define BUFFER_BYTES_MAX PLAYBACK_MAX_BUF_SIZE +#define BUFFER_BYTES_MIN (8192) #define COMPR_PLAYBACK_MAX_FRAGMENT_SIZE (128 * 1024) #define COMPR_PLAYBACK_MAX_NUM_FRAGMENTS (16 * 4) #define COMPR_PLAYBACK_MIN_FRAGMENT_SIZE (8 * 1024) From 081a921f705bd000097fb53efe1db7effaef8fe5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:24 +0100 Subject: [PATCH 664/707] of: provide a cleanup helper for OF nodes Allow to use __free() to automatically put references to OF nodes. Signed-off-by: Bartosz Golaszewski --- include/linux/of.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/of.h b/include/linux/of.h index 331e05918f11f9..5462ed47f25bc1 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -11,6 +11,8 @@ * Updates for SPARC64 by David S. Miller * Derived from PowerPC and Sparc prom.h files by Stephen Rothwell, IBM Corp. */ + +#include #include #include #include @@ -887,6 +889,8 @@ static inline const void *of_device_get_match_data(const struct device *dev) #define of_match_node(_matches, _node) NULL #endif /* CONFIG_OF */ +DEFINE_FREE(of_node, struct device_node *, if (_T) of_node_put(_T)) + /* Default string compare functions, Allow arch asm/prom.h to override */ #if !defined(of_compat_cmp) #define of_compat_cmp(s1, s2, l) strcasecmp((s1), (s2)) From cf0d1217677bf37ba5be94fc2866be5d1e00eb99 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:25 +0100 Subject: [PATCH 665/707] arm64: dts: qcom: qrb5165-rb5: model the PMU of the QCA6391 Add a node for the PMU module of the QCA6391 present on the RB5 board. Assign its LDO power outputs to the existing Bluetooth module. Add a node for the PCIe port to sm8250.dtsi and define the WLAN node on it in the board's .dts and also make it consume the power outputs of the PMU. Signed-off-by: Bartosz Golaszewski --- arch/arm64/boot/dts/qcom/qrb5165-rb5.dts | 128 +++++++++++++++++++++-- arch/arm64/boot/dts/qcom/sm8250.dtsi | 10 ++ 2 files changed, 127 insertions(+), 11 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts index cd0db4f31d4af9..fab5bebafbadc9 100644 --- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts +++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts @@ -108,6 +108,87 @@ regulator-always-on; }; + qca6390_pmu: pmu@0 { + compatible = "qcom,qca6390-pmu"; + + pinctrl-names = "default"; + pinctrl-0 = <&bt_en_state>, <&wlan_en_state>; + + vddaon-supply = <&vreg_s6a_0p95>; + vddpmu-supply = <&vreg_s2f_0p95>; + vddrfa1-supply = <&vreg_s2f_0p95>; + vddrfa2-supply = <&vreg_s8c_1p3>; + vddrfa3-supply = <&vreg_s5a_1p9>; + vddpcie1-supply = <&vreg_s8c_1p3>; + vddpcie2-supply = <&vreg_s5a_1p9>; + vddio-supply = <&vreg_s4a_1p8>; + + wlan-enable-gpios = <&tlmm 20 GPIO_ACTIVE_HIGH>; + bt-enable-gpios = <&tlmm 21 GPIO_ACTIVE_HIGH>; + + regulators { + vreg_pmu_rfa_cmn: ldo0 { + regulator-name = "vreg_pmu_rfa_cmn"; + regulator-min-microvolt = <760000>; + regulator-max-microvolt = <840000>; + }; + + vreg_pmu_aon_0p59: ldo1 { + regulator-name = "vreg_pmu_aon_0p59"; + regulator-min-microvolt = <540000>; + regulator-max-microvolt = <840000>; + }; + + vreg_pmu_wlcx_0p8: ldo2 { + regulator_name = "vreg_pmu_wlcx_0p8"; + regulator-min-microvolt = <760000>; + regulator-max-microvolt = <840000>; + }; + + vreg_pmu_wlmx_0p85: ldo3 { + regulator-name = "vreg_pmu_wlmx_0p85"; + regulator-min-microvolt = <810000>; + regulator-max-microvolt = <890000>; + }; + + vreg_pmu_btcmx_0p85: ldo4 { + regulator-name = "vreg_pmu_btcmx_0p85"; + regulator-min-microvolt = <810000>; + regulator-max-microvolt = <890000>; + }; + + vreg_pmu_rfa_0p8: ldo5 { + regulator-name = "vreg_pmu_rfa_0p8"; + regulator-min-microvolt = <760000>; + regulator-max-microvolt = <840000>; + }; + + vreg_pmu_rfa_1p2: ldo6 { + regulator-name = "vreg_pmu_rfa_1p2"; + regulator-min-microvolt = <1187000>; + regulator-max-microvolt = <1313000>; + }; + + vreg_pmu_rfa_1p7: ldo7 { + regulator_name = "vreg_pmu_rfa_1p7"; + regulator-min-microvolt = <1710000>; + regulator-max-microvolt = <1890000>; + }; + + vreg_pmu_pcie_0p9: ldo8 { + regulator_name = "vreg_pmu_pcie_0p9"; + regulator-min-microvolt = <870000>; + regulator-max-microvolt = <970000>; + }; + + vreg_pmu_pcie_1p8: ldo9 { + regulator_name = "vreg_pmu_pcie_1p8"; + regulator-min-microvolt = <1710000>; + regulator-max-microvolt = <1890000>; + }; + }; + }; + thermal-zones { conn-thermal { polling-delay-passive = <0>; @@ -734,6 +815,24 @@ vdda-pll-supply = <&vreg_l9a_1p2>; }; +&pcieport0 { + wifi@0 { + compatible = "pci17cb,1101"; + reg = <0x10000 0x0 0x0 0x0 0x0>; + + vddrfacmn-supply = <&vreg_pmu_rfa_cmn>; + vddaon-supply = <&vreg_pmu_aon_0p59>; + vddwlcx-supply = <&vreg_pmu_wlcx_0p8>; + vddwlmx-supply = <&vreg_pmu_wlmx_0p85>; + vddbtcmx-supply = <&vreg_pmu_btcmx_0p85>; + vddrfa0-supply = <&vreg_pmu_rfa_0p8>; + vddrfa1-supply = <&vreg_pmu_rfa_1p2>; + vddrfa2-supply = <&vreg_pmu_rfa_1p7>; + vddpcie0-supply = <&vreg_pmu_pcie_0p9>; + vddpcie1-supply = <&vreg_pmu_pcie_1p8>; + }; +}; + &pcie1 { status = "okay"; }; @@ -1303,6 +1402,14 @@ function = "gpio"; bias-pull-up; }; + + wlan_en_state: wlan-default-state { + pins = "gpio20"; + function = "gpio"; + drive-strength = <16>; + output-low; + bias-pull-up; + }; }; &uart6 { @@ -1311,17 +1418,16 @@ bluetooth { compatible = "qcom,qca6390-bt"; - pinctrl-names = "default"; - pinctrl-0 = <&bt_en_state>; - - enable-gpios = <&tlmm 21 GPIO_ACTIVE_HIGH>; - - vddio-supply = <&vreg_s4a_1p8>; - vddpmu-supply = <&vreg_s2f_0p95>; - vddaon-supply = <&vreg_s6a_0p95>; - vddrfa0p9-supply = <&vreg_s2f_0p95>; - vddrfa1p3-supply = <&vreg_s8c_1p3>; - vddrfa1p9-supply = <&vreg_s5a_1p9>; + vddrfacmn-supply = <&vreg_pmu_rfa_cmn>; + vddaon-supply = <&vreg_pmu_aon_0p59>; + vddwlcx-supply = <&vreg_pmu_wlcx_0p8>; + vddwlmx-supply = <&vreg_pmu_wlmx_0p85>; + vddbtcmx-supply = <&vreg_pmu_btcmx_0p85>; + vddrfa0-supply = <&vreg_pmu_rfa_0p8>; + vddrfa1-supply = <&vreg_pmu_rfa_1p2>; + vddrfa2-supply = <&vreg_pmu_rfa_1p7>; + vddpcie0-supply = <&vreg_pmu_pcie_0p9>; + vddpcie1-supply = <&vreg_pmu_pcie_1p8>; }; }; diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi index 4d849e98bf9bdc..7cd21d4e727832 100644 --- a/arch/arm64/boot/dts/qcom/sm8250.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi @@ -2203,6 +2203,16 @@ dma-coherent; status = "disabled"; + + pcieport0: pcie@0 { + device_type = "pci"; + reg = <0x0 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + ranges; + + bus-range = <0x01 0xff>; + }; }; pcie0_phy: phy@1c06000 { From 2b1a70d9fac3fe0aaf86cbd498efc12394e0d719 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:26 +0100 Subject: [PATCH 666/707] power: sequencing: new subsystem Implement the power sequencing subsystem allowing devices to share complex powering-up and down procedures. It's split into the consumer and provider parts but does not implement any new DT bindings so that the actual power sequencing is never revealed in the DT representation. Signed-off-by: Bartosz Golaszewski --- drivers/power/Kconfig | 1 + drivers/power/Makefile | 1 + drivers/power/sequencing/Kconfig | 12 + drivers/power/sequencing/Makefile | 4 + drivers/power/sequencing/core.c | 482 ++++++++++++++++++++++++++++++ include/linux/pwrseq/consumer.h | 53 ++++ include/linux/pwrseq/provider.h | 41 +++ 7 files changed, 594 insertions(+) create mode 100644 drivers/power/sequencing/Kconfig create mode 100644 drivers/power/sequencing/Makefile create mode 100644 drivers/power/sequencing/core.c create mode 100644 include/linux/pwrseq/consumer.h create mode 100644 include/linux/pwrseq/provider.h diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 696bf77a70420e..9a8e44ca9ae4a5 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only source "drivers/power/reset/Kconfig" +source "drivers/power/sequencing/Kconfig" source "drivers/power/supply/Kconfig" diff --git a/drivers/power/Makefile b/drivers/power/Makefile index effbf0377f3218..962a2cd30a51d9 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_POWER_RESET) += reset/ +obj-$(CONFIG_POWER_SEQUENCING) += sequencing/ obj-$(CONFIG_POWER_SUPPLY) += supply/ diff --git a/drivers/power/sequencing/Kconfig b/drivers/power/sequencing/Kconfig new file mode 100644 index 00000000000000..ba5732b1dbf84c --- /dev/null +++ b/drivers/power/sequencing/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig POWER_SEQUENCING + tristate "Power Sequencing support" + help + Say Y here to enable the Power Sequencing subsystem. + + This subsystem is designed to control power to devices that share + complex resources and/or require specific power sequences to be run + during power-up. + + If unsure, say no. diff --git a/drivers/power/sequencing/Makefile b/drivers/power/sequencing/Makefile new file mode 100644 index 00000000000000..dcdf8c0c159e3d --- /dev/null +++ b/drivers/power/sequencing/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_POWER_SEQUENCING) += pwrseq-core.o +pwrseq-core-y := core.o diff --git a/drivers/power/sequencing/core.c b/drivers/power/sequencing/core.c new file mode 100644 index 00000000000000..f035caed0e4ec9 --- /dev/null +++ b/drivers/power/sequencing/core.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_IDA(pwrseq_ida); +/* + * Protects the device list on the bus from concurrent modifications but allows + * simultaneous read-only access. + */ +static DECLARE_RWSEM(pwrseq_sem); + +/** + * struct pwrseq_device - Private power sequencing data. + * @dev: Device struct associated with this sequencer. + * @id: Device ID. + * @owner: Prevents removal of active power sequencing providers. + * @pwrup_count: Keeps track of power state change requests. + * @sem: Protects the device against being unregistered while in use. + * @drvdata: Provider driver private data. + * @match: Power sequencer matching callback. + * @power_on: Power-on callback. + * @power_off: Power-off callback. + */ +struct pwrseq_device { + struct device dev; + int id; + struct module *owner; + unsigned int pwrup_count; + struct rw_semaphore dev_sem; + struct mutex state_lock; + void *drvdata; + pwrseq_match_func match; + pwrseq_power_state_func power_on; + pwrseq_power_state_func power_off; +}; + +/** + * struct pwrseq_desc - Wraps access to the pwrseq_device and ensures that one + * user cannot break the reference counting for others. + * @pwrseq: Reference to the power sequencing device. + * @powered_up: Power state set by the holder of the descriptor (not necessarily + * corresponding to the actual power state of the device). + */ +struct pwrseq_desc { + struct pwrseq_device *pwrseq; + bool powered_up; +}; + +static struct pwrseq_device *to_pwrseq_device(struct device *dev) +{ + return container_of(dev, struct pwrseq_device, dev); +} + +static struct pwrseq_device *pwrseq_device_get(struct pwrseq_device *pwrseq) +{ + get_device(&pwrseq->dev); + + return pwrseq; +} + +static void pwrseq_device_put(struct pwrseq_device *pwrseq) +{ + put_device(&pwrseq->dev); +} + +static struct bus_type pwrseq_bus = { + .name = "pwrseq", +}; + +static void pwrseq_release(struct device *dev) +{ + struct pwrseq_device *pwrseq = to_pwrseq_device(dev); + + mutex_destroy(&pwrseq->state_lock); + ida_free(&pwrseq_ida, pwrseq->id); + kfree(pwrseq); +} + +static const struct device_type pwrseq_device_type = { + .name = "power_sequencer", + .release = pwrseq_release, +}; + +/** + * pwrseq_device_register() - Register a new power sequencer. + * @config: Configuration of the new power sequencing device. + * + * The config structure is only used during the call and can be freed after + * the function returns. The config structure *must* have the parent device + * as well as the match(), power_on() and power_off() callbacks registered. + * + * Returns: + * Returns the address of the new pwrseq device or ERR_PTR() on failure. + */ +struct pwrseq_device *pwrseq_device_register(struct pwrseq_config *config) +{ + struct pwrseq_device *pwrseq; + int ret; + + /* + * Power sequencer must have a parent device and at least the power-on, + * power-off and match callbacks. + */ + if (!config->parent || !config->match || !config->power_on || + !config->power_off) + return ERR_PTR(-EINVAL); + + pwrseq = kzalloc(sizeof(*pwrseq), GFP_KERNEL); + if (!pwrseq) + return ERR_PTR(-ENOMEM); + + pwrseq->dev.type = &pwrseq_device_type; + pwrseq->dev.bus = &pwrseq_bus; + pwrseq->dev.parent = config->parent; + device_set_node(&pwrseq->dev, dev_fwnode(config->parent)); + + pwrseq->id = ida_alloc(&pwrseq_ida, GFP_KERNEL); + if (pwrseq->id < 0) { + kfree(pwrseq); + return ERR_PTR(pwrseq->id); + } + + /* + * From this point onwards the device's release() callback is + * responsible for freeing resources. + */ + device_initialize(&pwrseq->dev); + + ret = dev_set_name(&pwrseq->dev, "pwrseq.%d", pwrseq->id); + if (ret) + goto err_put_pwrseq; + + pwrseq->owner = config->owner ?: THIS_MODULE; + pwrseq->drvdata = config->drvdata; + pwrseq->match = config->match; + pwrseq->power_on = config->power_on; + pwrseq->power_off = config->power_off; + + init_rwsem(&pwrseq->dev_sem); + mutex_init(&pwrseq->state_lock); + + scoped_guard(rwsem_write, &pwrseq_sem) { + ret = device_add(&pwrseq->dev); + if (ret) + goto err_put_pwrseq; + } + + return pwrseq; + +err_put_pwrseq: + pwrseq_device_put(pwrseq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(pwrseq_device_register); + +/** + * pwrseq_device_unregister() - Unregister the power sequencer. + * @pwrseq: Power sequencer to unregister. + */ +void pwrseq_device_unregister(struct pwrseq_device *pwrseq) +{ + struct device *dev = &pwrseq->dev; + + scoped_guard(mutex, &pwrseq->state_lock) { + WARN_ONCE(pwrseq->pwrup_count > 0, + "%s: UNREGISTERING POWER SEQUENCER WITH ACTIVE USERS\n", + dev_name(&pwrseq->dev)); + + scoped_guard(rwsem_write, &pwrseq_sem) { + scoped_guard(rwsem_write, &pwrseq->dev_sem) + device_del(dev); + } + } + + pwrseq_device_put(pwrseq); +} +EXPORT_SYMBOL_GPL(pwrseq_device_unregister); + +static void devm_pwrseq_device_unregister(void *data) +{ + struct pwrseq_device *pwrseq = data; + + pwrseq_device_unregister(pwrseq); +} + +/** + * devm_pwrseq_device_register() - Managed variant of pwrseq_device_register(). + * @dev: Managing device. + * @config: Configuration of the new power sequencing device. + * + * Returns: + * Returns the address of the new pwrseq device or ERR_PTR() on failure. + */ +struct pwrseq_device * +devm_pwrseq_device_register(struct device *dev, struct pwrseq_config *config) +{ + struct pwrseq_device *pwrseq; + int ret; + + pwrseq = pwrseq_device_register(config); + if (IS_ERR(pwrseq)) + return pwrseq; + + ret = devm_add_action_or_reset(dev, devm_pwrseq_device_unregister, + pwrseq); + if (ret) + return ERR_PTR(ret); + + return pwrseq; +} +EXPORT_SYMBOL_GPL(devm_pwrseq_device_register); + +/** + * pwrseq_device_get_data() - Get the driver private data associated with this + * sequencer. + * @pwrseq: Power sequencer object. + * + * Returns: + * Address of the private driver data. + */ +void *pwrseq_device_get_data(struct pwrseq_device *pwrseq) +{ + return pwrseq->drvdata; +} +EXPORT_SYMBOL_GPL(pwrseq_device_get_data); + +struct pwrseq_match_data { + struct pwrseq_device *matched; + struct device *dev; +}; + +static int pwrseq_match_device(struct device *pwrseq_dev, void *data) +{ + struct pwrseq_device *pwrseq = to_pwrseq_device(pwrseq_dev); + struct pwrseq_match_data *match_data = data; + int ret; + + guard(rwsem_read)(&pwrseq->dev_sem); + if (!device_is_registered(&pwrseq->dev)) + return 0; + + ret = pwrseq->match(pwrseq, match_data->dev); + if (ret <= 0) + return ret; + + match_data->matched = pwrseq; + + return 1; +} + +/** + * pwrseq_get() - Get the power sequencer associated with this device. + * @dev: Device for which to get the sequencer. + * + * Returns: + * New power sequencer descriptor for use by the consumer driver or ERR_PTR() + * on failure. + */ +struct pwrseq_desc *pwrseq_get(struct device *dev) +{ + struct pwrseq_match_data match_data; + struct pwrseq_device *pwrseq; + int ret; + + struct pwrseq_desc *desc __free(kfree) = kzalloc(sizeof(*desc), + GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); + + match_data.matched = NULL; + match_data.dev = dev; + + guard(rwsem_read)(&pwrseq_sem); + + ret = bus_for_each_dev(&pwrseq_bus, NULL, &match_data, + pwrseq_match_device); + if (ret < 0) + return ERR_PTR(ret); + if (ret == 0) + /* No device matched. */ + return ERR_PTR(-EPROBE_DEFER); + + pwrseq = match_data.matched; + + if (!try_module_get(pwrseq->owner)) + return ERR_PTR(-EPROBE_DEFER); + + desc->pwrseq = pwrseq_device_get(pwrseq); + + return no_free_ptr(desc); +} +EXPORT_SYMBOL_GPL(pwrseq_get); + +/** + * pwrseq_put() - Release the power sequencer descriptor. + * @desc: Descriptor to release. + */ +void pwrseq_put(struct pwrseq_desc *desc) +{ + struct pwrseq_device *pwrseq; + + if (!desc) + return; + + pwrseq = desc->pwrseq; + + if (desc->powered_up) + pwrseq_power_off(desc); + + kfree(desc); + module_put(pwrseq->owner); + pwrseq_device_put(pwrseq); +} +EXPORT_SYMBOL_GPL(pwrseq_put); + +static void devm_pwrseq_put(void *data) +{ + struct pwrseq_desc *desc = data; + + pwrseq_put(desc); +} + +/** + * devm_pwrseq_get() - Managed variant of pwrseq_get(). + * @dev: Device for which to get the sequencer and which also manages its + * lifetime. + * + * Returns: + * New power sequencer descriptor for use by the consumer driver or ERR_PTR() + * on failure. + */ +struct pwrseq_desc *devm_pwrseq_get(struct device *dev) +{ + struct pwrseq_desc *desc; + int ret; + + desc = pwrseq_get(dev); + if (IS_ERR(desc)) + return desc; + + ret = devm_add_action_or_reset(dev, devm_pwrseq_put, desc); + if (ret) + return ERR_PTR(ret); + + return desc; +} +EXPORT_SYMBOL_GPL(devm_pwrseq_get); + +/** + * pwrseq_power_on() - Issue a power-on request on behalf of the consumer + * device. + * @desc: Descriptor referencing the power sequencer. + * + * This function tells the power sequencer that the consumer wants to be + * powered-up. The sequencer may already have powered-up the device in which + * case the function returns 0. If the power-up sequence is already in + * progress, the function will block until it's done and return 0. If this is + * the first request, the device will be powered up. + * + * Returns: + * 0 on success, negative error number on failure. + */ +int pwrseq_power_on(struct pwrseq_desc *desc) +{ + struct pwrseq_device *pwrseq; + int ret; + + might_sleep(); + + if (!desc || desc->powered_up) + return 0; + + pwrseq = desc->pwrseq; + + guard(rwsem_read)(&pwrseq->dev_sem); + if (!device_is_registered(&pwrseq->dev)) + return -ENODEV; + + guard(mutex)(&pwrseq->state_lock); + + pwrseq->pwrup_count++; + if (pwrseq->pwrup_count != 1) { + desc->powered_up = true; + return 0; + } + + ret = pwrseq->power_on(pwrseq); + if (!ret) + desc->powered_up = true; + + return ret; +} +EXPORT_SYMBOL_GPL(pwrseq_power_on); + +/** + * pwrseq_power_off() - Issue a power-off request on behalf of the consumer + * device. + * @desc: Descriptor referencing the power sequencer. + * + * This undoes the effects of pwrseq_power_on(). It issues a power-off request + * on behalf of the consumer and when the last remaining user does so, the + * power-down sequence will be started. If one is in progress, the function + * will block until it's complete and then return. + * + * Returns: + * 0 on success, negative error number on failure. + */ +int pwrseq_power_off(struct pwrseq_desc *desc) +{ + struct pwrseq_device *pwrseq; + int ret; + + might_sleep(); + + if (!desc || !desc->powered_up) + return 0; + + pwrseq = desc->pwrseq; + + guard(rwsem_read)(&pwrseq->dev_sem); + if (!device_is_registered(&pwrseq->dev)) + return -ENODEV; + + guard(mutex)(&pwrseq->state_lock); + + if (pwrseq->pwrup_count == 0) { + WARN_ONCE(1, "Unmatched power-off\n"); + return -EBUSY; + } + + pwrseq->pwrup_count--; + if (pwrseq->pwrup_count != 0) { + desc->powered_up = false; + return 0; + } + + ret = pwrseq->power_off(pwrseq); + if (!ret) + desc->powered_up = false; + + return ret; +} +EXPORT_SYMBOL_GPL(pwrseq_power_off); + +static int __init pwrseq_init(void) +{ + int ret; + + ret = bus_register(&pwrseq_bus); + if (ret) { + pr_err("Failed to register the power sequencer bus\n"); + return ret; + } + + return 0; +} +subsys_initcall(pwrseq_init); + +static void __exit pwrseq_exit(void) +{ + bus_unregister(&pwrseq_bus); +} +module_exit(pwrseq_exit); + +MODULE_AUTHOR("Bartosz Golaszewski "); +MODULE_DESCRIPTION("Power Sequencing subsystem core"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/pwrseq/consumer.h b/include/linux/pwrseq/consumer.h new file mode 100644 index 00000000000000..f207b8b2864dbe --- /dev/null +++ b/include/linux/pwrseq/consumer.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __POWER_SEQUENCING_CONSUMER_H__ +#define __POWER_SEQUENCING_CONSUMER_H__ + +#include + +struct device; +struct pwrseq_desc; + +#if IS_ENABLED(CONFIG_POWER_SEQUENCING) + +struct pwrseq_desc * __must_check pwrseq_get(struct device *dev); +void pwrseq_put(struct pwrseq_desc *desc); + +struct pwrseq_desc * __must_check devm_pwrseq_get(struct device *dev); + +int pwrseq_power_on(struct pwrseq_desc *desc); +int pwrseq_power_off(struct pwrseq_desc *desc); + +#else /* CONFIG_POWER_SEQUENCING */ + +static inline struct pwrseq_desc * __must_check pwrseq_get(struct device *dev) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void pwrseq_put(struct pwrseq_desc *desc) +{ +} + +static inline struct pwrseq_desc * __must_check +devm_pwrseq_get(struct device *dev) +{ + return ERR_PTR(-ENOSYS); +} + +static inline int pwrseq_power_on(struct pwrseq_desc *desc) +{ + return -ENOSYS; +} + +static inline int pwrseq_power_off(struct pwrseq_desc *desc) +{ + return -ENOSYS; +} + +#endif /* CONFIG_POWER_SEQUENCING */ + +#endif /* __POWER_SEQUENCING_CONSUMER_H__ */ diff --git a/include/linux/pwrseq/provider.h b/include/linux/pwrseq/provider.h new file mode 100644 index 00000000000000..8696a89afa43c0 --- /dev/null +++ b/include/linux/pwrseq/provider.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __POWER_SEQUENCING_PROVIDER_H__ +#define __POWER_SEQUENCING_PROVIDER_H__ + +struct device; +struct module; +struct pwrseq_device; + +typedef int (*pwrseq_power_state_func)(struct pwrseq_device *); +typedef int (*pwrseq_match_func)(struct pwrseq_device *, struct device *); + +/** + * struct pwrseq_config - Configuration used for registering a new provider. + * @parent: Parent device for the sequencer. + * @owner: Module providing this device. + * @drvdata: Private driver data. + * @match: Provider callback used to match the consumer device to the sequencer. + * @power_on: Callback running the power-on sequence. + * @power_off: Callback running the power-off sequence. + */ +struct pwrseq_config { + struct device *parent; + struct module *owner; + void *drvdata; + pwrseq_match_func match; + pwrseq_power_state_func power_on; + pwrseq_power_state_func power_off; +}; + +struct pwrseq_device *pwrseq_device_register(struct pwrseq_config *config); +void pwrseq_device_unregister(struct pwrseq_device *pwrseq); +struct pwrseq_device * +devm_pwrseq_device_register(struct device *dev, struct pwrseq_config *config); + +void *pwrseq_device_get_data(struct pwrseq_device *pwrseq); + +#endif /* __POWER_SEQUENCING_PROVIDER_H__ */ From 1f9afdbf94c63d9031b3948807e88e72e1d09a60 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:27 +0100 Subject: [PATCH 667/707] power: pwrseq: add a driver for the QCA6390 PMU module This adds the power sequencing driver for the QCA6390's PMU module. It uses the pwrseq subsystem and knows how to match the sequencer to the consumer device by verifying the relevant properties and DT layout. Signed-off-by: Bartosz Golaszewski --- drivers/power/sequencing/Kconfig | 16 ++ drivers/power/sequencing/Makefile | 2 + drivers/power/sequencing/pwrseq-qca6390.c | 232 ++++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 drivers/power/sequencing/pwrseq-qca6390.c diff --git a/drivers/power/sequencing/Kconfig b/drivers/power/sequencing/Kconfig index ba5732b1dbf84c..84ddf3b4ae56ad 100644 --- a/drivers/power/sequencing/Kconfig +++ b/drivers/power/sequencing/Kconfig @@ -10,3 +10,19 @@ menuconfig POWER_SEQUENCING during power-up. If unsure, say no. + +if POWER_SEQUENCING + +config POWER_SEQUENCING_QCA6390 + tristate "QCA6390 PMU driver" + default m if ARCH_QCOM + help + Say U here to enable the power sequencing driver for Qualcomm + QCA6390. + + The QCA6390 package contains the BT and WLAN modules whose power + is controlled by the PMU module. As the former two share the power-up + sequence which is executed by the PMU, this driver is needed for + correct power control. + +endif diff --git a/drivers/power/sequencing/Makefile b/drivers/power/sequencing/Makefile index dcdf8c0c159e3d..628345c4e7aecc 100644 --- a/drivers/power/sequencing/Makefile +++ b/drivers/power/sequencing/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_POWER_SEQUENCING) += pwrseq-core.o pwrseq-core-y := core.o + +obj-$(CONFIG_POWER_SEQUENCING_QCA6390) += pwrseq-qca6390.o diff --git a/drivers/power/sequencing/pwrseq-qca6390.c b/drivers/power/sequencing/pwrseq-qca6390.c new file mode 100644 index 00000000000000..6c930e3e88ec9a --- /dev/null +++ b/drivers/power/sequencing/pwrseq-qca6390.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pwrseq_qca6390_vreg { + const char *name; + unsigned int load_uA; +}; + +struct pwrseq_qca6390_pdata { + const struct pwrseq_qca6390_vreg *vregs; + size_t num_vregs; + unsigned int pwup_delay_msec; +}; + +struct pwrseq_qca6390_ctx { + struct pwrseq_device *pwrseq; + struct device_node *of_node; + const struct pwrseq_qca6390_pdata *pdata; + struct regulator_bulk_data *regs; + struct gpio_desc *bt_gpio; + struct gpio_desc *wlan_gpio; +}; + +static const struct pwrseq_qca6390_vreg pwrseq_qca6390_vregs[] = { + { + .name = "vddio", + .load_uA = 20000, + }, + { + .name = "vddaon", + .load_uA = 100000, + }, + { + .name = "vddpmu", + .load_uA = 1250000, + }, + { + .name = "vddpcie1", + .load_uA = 35000, + }, + { + .name = "vddpcie2", + .load_uA = 15000, + }, + { + .name = "vddrfa1", + .load_uA = 200000, + }, + { + .name = "vddrfa2", + .load_uA = 400000, + }, + { + .name = "vddrfa3", + .load_uA = 400000, + }, +}; + +static const struct pwrseq_qca6390_pdata pwrseq_qca6390_of_data = { + .vregs = pwrseq_qca6390_vregs, + .num_vregs = ARRAY_SIZE(pwrseq_qca6390_vregs), + .pwup_delay_msec = 16, +}; + +static int pwrseq_qca6390_power_on(struct pwrseq_device *pwrseq) +{ + struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq); + int ret; + + ret = regulator_bulk_enable(ctx->pdata->num_vregs, ctx->regs); + if (ret) + return ret; + + gpiod_set_value_cansleep(ctx->bt_gpio, 1); + gpiod_set_value_cansleep(ctx->wlan_gpio, 1); + + if (ctx->pdata->pwup_delay_msec) + msleep(ctx->pdata->pwup_delay_msec); + + return 0; +} + +static int pwrseq_qca6390_power_off(struct pwrseq_device *pwrseq) +{ + struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq); + + gpiod_set_value_cansleep(ctx->bt_gpio, 0); + gpiod_set_value_cansleep(ctx->wlan_gpio, 0); + + return regulator_bulk_disable(ctx->pdata->num_vregs, ctx->regs); +} + +static int pwrseq_qca6390_match(struct pwrseq_device *pwrseq, + struct device *dev) +{ + struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq); + struct device_node *dev_node = dev->of_node; + + /* + * The PMU supplies power to the Bluetooth and WLAN modules. both + * consume the PMU AON output so check the presence of the + * 'vddaon-supply' property and whether it leads us to the right + * device. + */ + if (!of_property_present(dev_node, "vddaon-supply")) + return 0; + + struct device_node *reg_node __free(of_node) = + of_parse_phandle(dev_node, "vddaon-supply", 0); + if (!reg_node) + return 0; + + /* + * `reg_node` is the PMU AON regulator, its parent is the `regulators` + * node and finally its grandparent is the PMU device node that we're + * looking for. + */ + if (!reg_node->parent || !reg_node->parent->parent || + reg_node->parent->parent != ctx->of_node) + return 0; + + return 1; +} + +static int pwrseq_qca6390_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pwrseq_qca6390_ctx *ctx; + struct pwrseq_config config; + int ret, i; + + ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->of_node = dev->of_node; + + ctx->pdata = of_device_get_match_data(dev); + if (!ctx->pdata) + return dev_err_probe(dev, -ENODEV, + "Failed to obtain platform data\n"); + + if (ctx->pdata->vregs) { + ctx->regs = devm_kcalloc(dev, ctx->pdata->num_vregs, + sizeof(*ctx->regs), GFP_KERNEL); + if (!ctx->regs) + return -ENOMEM; + + for (i = 0; i < ctx->pdata->num_vregs; i++) + ctx->regs[i].supply = ctx->pdata->vregs[i].name; + + ret = devm_regulator_bulk_get(dev, ctx->pdata->num_vregs, + ctx->regs); + if (ret < 0) + return dev_err_probe(dev, ret, + "Failed to get all regulators\n"); + + for (i = 0; i < ctx->pdata->num_vregs; i++) { + if (!ctx->pdata->vregs[1].load_uA) + continue; + + ret = regulator_set_load(ctx->regs[i].consumer, + ctx->pdata->vregs[i].load_uA); + if (ret) + return dev_err_probe(dev, ret, + "Failed to set vreg load\n"); + } + } + + ctx->bt_gpio = devm_gpiod_get_optional(dev, "bt-enable", GPIOD_OUT_LOW); + if (IS_ERR(ctx->bt_gpio)) + return dev_err_probe(dev, PTR_ERR(ctx->bt_gpio), + "Failed to get the Bluetooth enable GPIO\n"); + + ctx->wlan_gpio = devm_gpiod_get_optional(dev, "wlan-enable", + GPIOD_OUT_LOW); + if (IS_ERR(ctx->wlan_gpio)) + return dev_err_probe(dev, PTR_ERR(ctx->wlan_gpio), + "Failed to get the WLAN enable GPIO\n"); + + memset(&config, 0, sizeof(config)); + + config.parent = dev; + config.owner = THIS_MODULE; + config.drvdata = ctx; + config.match = pwrseq_qca6390_match; + config.power_on = pwrseq_qca6390_power_on; + config.power_off = pwrseq_qca6390_power_off; + + ctx->pwrseq = devm_pwrseq_device_register(dev, &config); + if (IS_ERR(ctx->pwrseq)) + return dev_err_probe(dev, PTR_ERR(ctx->pwrseq), + "Failed to register the power sequencer\n"); + + return 0; +} + +static const struct of_device_id pwrseq_qca6390_of_match[] = { + { + .compatible = "qcom,qca6390-pmu", + .data = &pwrseq_qca6390_of_data, + }, + { } +}; +MODULE_DEVICE_TABLE(of, pwrseq_qca6390_of_match); + +static struct platform_driver pwrseq_qca6390_driver = { + .driver = { + .name = "pwrseq-qca6390", + .of_match_table = pwrseq_qca6390_of_match, + }, + .probe = pwrseq_qca6390_probe, +}; +module_platform_driver(pwrseq_qca6390_driver); + +MODULE_AUTHOR("Bartosz Golaszewski "); +MODULE_DESCRIPTION("QCA6390 PMU power sequencing driver"); +MODULE_LICENSE("GPL"); From b9ed8b5a2c000afaae55811a3a7aacfbf1cf7b16 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:28 +0100 Subject: [PATCH 668/707] Bluetooth: qca: use the power sequencer for QCA6390 Use the pwrseq subsystem's consumer API to run the power-up sequence for the Bluetooth module of the QCA6390 package. Signed-off-by: Bartosz Golaszewski --- drivers/bluetooth/hci_qca.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 94b8c406f0c0ed..21c306caafbcb3 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -214,6 +215,7 @@ struct qca_power { struct regulator_bulk_data *vreg_bulk; int num_vregs; bool vregs_on; + struct pwrseq_desc *pwrseq; }; struct qca_serdev { @@ -1791,6 +1793,11 @@ static int qca_power_on(struct hci_dev *hdev) ret = qca_regulator_init(hu); break; + case QCA_QCA6390: + qcadev = serdev_device_get_drvdata(hu->serdev); + ret = pwrseq_power_on(qcadev->bt_power->pwrseq); + break; + default: qcadev = serdev_device_get_drvdata(hu->serdev); if (qcadev->bt_en) { @@ -2160,6 +2167,10 @@ static void qca_power_shutdown(struct hci_uart *hu) } break; + case QCA_QCA6390: + pwrseq_power_off(qcadev->bt_power->pwrseq); + break; + default: gpiod_set_value_cansleep(qcadev->bt_en, 0); } @@ -2298,12 +2309,25 @@ static int qca_serdev_probe(struct serdev_device *serdev) case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: + case QCA_QCA6390: qcadev->bt_power = devm_kzalloc(&serdev->dev, sizeof(struct qca_power), GFP_KERNEL); if (!qcadev->bt_power) return -ENOMEM; + break; + default: + break; + } + switch (qcadev->btsoc_type) { + case QCA_WCN3988: + case QCA_WCN3990: + case QCA_WCN3991: + case QCA_WCN3998: + case QCA_WCN6750: + case QCA_WCN6855: + case QCA_WCN7850: qcadev->bt_power->dev = &serdev->dev; err = qca_init_regulators(qcadev->bt_power, data->vregs, data->num_vregs); @@ -2344,6 +2368,12 @@ static int qca_serdev_probe(struct serdev_device *serdev) } break; + case QCA_QCA6390: + qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev); + if (IS_ERR(qcadev->bt_power->pwrseq)) + return PTR_ERR(qcadev->bt_power->pwrseq); + fallthrough; + default: qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable", GPIOD_OUT_LOW); From f5e49cd2dcec4fad9a61eef8e34f0cb4ce09f0f3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:29 +0100 Subject: [PATCH 669/707] PCI: create platform devices for child OF nodes of the port node In order to introduce PCI power-sequencing, we need to create platform devices for child nodes of the port node. They will get matched against the pwrseq drivers (if one exists) and then the actual PCI device will reuse the node once it's detected on the bus. Signed-off-by: Bartosz Golaszewski --- drivers/pci/bus.c | 9 ++++++++- drivers/pci/remove.c | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 826b5016a10102..17ab41094c4e3c 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -342,8 +343,14 @@ void pci_bus_add_device(struct pci_dev *dev) */ pcibios_bus_add_device(dev); pci_fixup_device(pci_fixup_final, dev); - if (pci_is_bridge(dev)) + if (pci_is_bridge(dev)) { of_pci_make_dev_node(dev); + retval = of_platform_populate(dev->dev.of_node, NULL, NULL, + &dev->dev); + if (retval) + pci_err(dev, "failed to populate child OF nodes (%d)\n", + retval); + } pci_create_sysfs_dev_files(dev); pci_proc_attach_device(dev); pci_bridge_d3_update(dev); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index d749ea8250d656..fc9db2805888ab 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "pci.h" static void pci_free_resources(struct pci_dev *dev) @@ -22,6 +23,7 @@ static void pci_stop_dev(struct pci_dev *dev) device_release_driver(&dev->dev); pci_proc_detach_device(dev); pci_remove_sysfs_dev_files(dev); + of_platform_depopulate(&dev->dev); of_pci_remove_node(dev); pci_dev_assign_added(dev, false); From d3688012fd051930c975956e04cd519bbe8fddca Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:30 +0100 Subject: [PATCH 670/707] PCI: hold the rescan mutex when scanning for the first time With the introduction of the power sequencing drivers that will be able to trigger the port rescan, we need to hold the rescan mutex during the initial pci_host_probe() too or the two could get in each other's way. Signed-off-by: Bartosz Golaszewski --- drivers/pci/probe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b7335be56008f7..957f7afee7ba51 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3122,7 +3122,9 @@ int pci_host_probe(struct pci_host_bridge *bridge) struct pci_bus *bus, *child; int ret; + pci_lock_rescan_remove(); ret = pci_scan_root_bus_bridge(bridge); + pci_unlock_rescan_remove(); if (ret < 0) { dev_err(bridge->dev.parent, "Scanning root bridge failed"); return ret; From a0c987643e9bfdf48ec9e06640a724a63c5c2096 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:31 +0100 Subject: [PATCH 671/707] PCI/pwrctl: add PCI power control core code Some PCI devices must be powered-on before they can be detected on the bus. Introduce a simple framework reusing the existing PCI OF infrastructure. The way this works is: a DT node representing a PCI device connected to the port can be matched against its power control platform driver. If the match succeeds, the driver is responsible for powering-up the device and calling pcie_pwrctl_device_enable() which will trigger a PCI bus rescan as well as subscribe to PCI bus notifications. When the device is detected and created, we'll make it consume the same DT node that the platform device did. When the device is bound, we'll create a device link between it and the parent power control device. Signed-off-by: Bartosz Golaszewski --- drivers/pci/Kconfig | 1 + drivers/pci/Makefile | 1 + drivers/pci/pwrctl/Kconfig | 8 ++++ drivers/pci/pwrctl/Makefile | 3 ++ drivers/pci/pwrctl/core.c | 82 +++++++++++++++++++++++++++++++++++++ include/linux/pci-pwrctl.h | 24 +++++++++++ 6 files changed, 119 insertions(+) create mode 100644 drivers/pci/pwrctl/Kconfig create mode 100644 drivers/pci/pwrctl/Makefile create mode 100644 drivers/pci/pwrctl/core.c create mode 100644 include/linux/pci-pwrctl.h diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 74147262625bc2..5b9b84f8774fb0 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -291,5 +291,6 @@ source "drivers/pci/hotplug/Kconfig" source "drivers/pci/controller/Kconfig" source "drivers/pci/endpoint/Kconfig" source "drivers/pci/switch/Kconfig" +source "drivers/pci/pwrctl/Kconfig" endif diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index cc8b4e01e29de5..6ae202d950f8bf 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \ obj-$(CONFIG_PCI) += msi/ obj-$(CONFIG_PCI) += pcie/ +obj-$(CONFIG_PCI) += pwrctl/ ifdef CONFIG_PCI obj-$(CONFIG_PROC_FS) += proc.o diff --git a/drivers/pci/pwrctl/Kconfig b/drivers/pci/pwrctl/Kconfig new file mode 100644 index 00000000000000..e2dc5e5d2af1e8 --- /dev/null +++ b/drivers/pci/pwrctl/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "PCI Power control drivers" + +config PCI_PWRCTL + bool + +endmenu diff --git a/drivers/pci/pwrctl/Makefile b/drivers/pci/pwrctl/Makefile new file mode 100644 index 00000000000000..4381cfbf3f211a --- /dev/null +++ b/drivers/pci/pwrctl/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_PCI_PWRCTL) += core.o diff --git a/drivers/pci/pwrctl/core.c b/drivers/pci/pwrctl/core.c new file mode 100644 index 00000000000000..312e6fe95c315f --- /dev/null +++ b/drivers/pci/pwrctl/core.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct pci_pwrctl *pwrctl = container_of(nb, struct pci_pwrctl, nb); + struct device *dev = data; + + if (dev_fwnode(dev) != dev_fwnode(pwrctl->dev)) + return NOTIFY_DONE; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + device_set_of_node_from_dev(dev, pwrctl->dev); + break; + case BUS_NOTIFY_BOUND_DRIVER: + pwrctl->link = device_link_add(dev, pwrctl->dev, + DL_FLAG_AUTOREMOVE_CONSUMER); + if (!pwrctl->link) + dev_err(pwrctl->dev, "Failed to add device link\n"); + break; + case BUS_NOTIFY_UNBOUND_DRIVER: + device_link_del(pwrctl->link); + break; + } + + return NOTIFY_DONE; +} + +int pci_pwrctl_device_enable(struct pci_pwrctl *pwrctl) +{ + if (!pwrctl->dev) + return -ENODEV; + + pwrctl->nb.notifier_call = pci_pwrctl_notify; + bus_register_notifier(&pci_bus_type, &pwrctl->nb); + + pci_lock_rescan_remove(); + pci_rescan_bus(to_pci_dev(pwrctl->dev->parent)->bus); + pci_unlock_rescan_remove(); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_pwrctl_device_enable); + +void pci_pwrctl_device_disable(struct pci_pwrctl *pwrctl) +{ + bus_unregister_notifier(&pci_bus_type, &pwrctl->nb); +} +EXPORT_SYMBOL_GPL(pci_pwrctl_device_disable); + +static void devm_pci_pwrctl_device_disable(void *data) +{ + struct pci_pwrctl *pwrctl = data; + + pci_pwrctl_device_disable(pwrctl); +} + +int devm_pci_pwrctl_device_enable(struct device *dev, + struct pci_pwrctl *pwrctl) +{ + int ret; + + ret = pci_pwrctl_device_enable(pwrctl); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, devm_pci_pwrctl_device_disable, + pwrctl); +} +EXPORT_SYMBOL_GPL(devm_pci_pwrctl_device_enable); diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h new file mode 100644 index 00000000000000..8d16d27cbfebff --- /dev/null +++ b/include/linux/pci-pwrctl.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __PCI_PWRCTL_H__ +#define __PCI_PWRCTL_H__ + +#include + +struct device; + +struct pci_pwrctl { + struct notifier_block nb; + struct device *dev; + struct device_link *link; +}; + +int pci_pwrctl_device_enable(struct pci_pwrctl *pwrctl); +void pci_pwrctl_device_disable(struct pci_pwrctl *pwrctl); +int devm_pci_pwrctl_device_enable(struct device *dev, + struct pci_pwrctl *pwrctl); + +#endif /* __PCI_PWRCTL_H__ */ From ea458457f1f665494a23f3ced6f3fb692dcfb45b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 1 Feb 2024 16:55:32 +0100 Subject: [PATCH 672/707] PCI/pwrctl: add a PCI power control driver for power sequenced devices Add a PCI power control driver that's capable of correctly powering up devices using the power sequencing subsystem. For now we support the ath11k module on QCA6390. Signed-off-by: Bartosz Golaszewski --- drivers/pci/pwrctl/Kconfig | 9 +++ drivers/pci/pwrctl/Makefile | 1 + drivers/pci/pwrctl/pci-pwrctl-pwrseq.c | 83 ++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 drivers/pci/pwrctl/pci-pwrctl-pwrseq.c diff --git a/drivers/pci/pwrctl/Kconfig b/drivers/pci/pwrctl/Kconfig index e2dc5e5d2af1e8..bca72dc08e79fd 100644 --- a/drivers/pci/pwrctl/Kconfig +++ b/drivers/pci/pwrctl/Kconfig @@ -5,4 +5,13 @@ menu "PCI Power control drivers" config PCI_PWRCTL bool +config PCI_PWRCTL_PWRSEQ + tristate "PCI Power Control driver using the Power Sequencing subsystem" + select POWER_SEQUENCING + select PCI_PWRCTL + default m if (ATH11K_PCI && ARCH_QCOM) + help + Enable support for the PCI power control driver for device + drivers using the Power Sequencing subsystem. + endmenu diff --git a/drivers/pci/pwrctl/Makefile b/drivers/pci/pwrctl/Makefile index 4381cfbf3f211a..919c0f704ee9ea 100644 --- a/drivers/pci/pwrctl/Makefile +++ b/drivers/pci/pwrctl/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_PCI_PWRCTL) += core.o +obj-$(CONFIG_PCI_PWRCTL_PWRSEQ) += pci-pwrctl-pwrseq.o diff --git a/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c b/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c new file mode 100644 index 00000000000000..510598c4edc47b --- /dev/null +++ b/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023-2024 Linaro Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pci_pwrctl_pwrseq_data { + struct pci_pwrctl ctx; + struct pwrseq_desc *pwrseq; +}; + +static void devm_pci_pwrctl_pwrseq_power_off(void *data) +{ + struct pwrseq_desc *pwrseq = data; + + pwrseq_power_off(pwrseq); +} + +static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev) +{ + struct pci_pwrctl_pwrseq_data *data; + struct device *dev = &pdev->dev; + int ret; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->pwrseq = devm_pwrseq_get(dev); + if (IS_ERR(data->pwrseq)) + return dev_err_probe(dev, PTR_ERR(data->pwrseq), + "Failed to get the power sequencer\n"); + + ret = pwrseq_power_on(data->pwrseq); + if (ret) + return dev_err_probe(dev, ret, + "Failed to power-on the device\n"); + + ret = devm_add_action_or_reset(dev, devm_pci_pwrctl_pwrseq_power_off, + data->pwrseq); + if (ret) + return ret; + + data->ctx.dev = dev; + + ret = devm_pci_pwrctl_device_enable(dev, &data->ctx); + if (ret) + return dev_err_probe(dev, ret, + "Failed to register the pwrctl wrapper\n"); + + return 0; +} + +static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = { + { + /* ATH11K in QCA6390 package. */ + .compatible = "pci17cb,1101", + }, + { } +}; +MODULE_DEVICE_TABLE(of, pci_pwrctl_pwrseq_of_match); + +static struct platform_driver pci_pwrctl_pwrseq_driver = { + .driver = { + .name = "pci-pwrctl-pwrseq", + .of_match_table = pci_pwrctl_pwrseq_of_match, + }, + .probe = pci_pwrctl_pwrseq_probe, +}; +module_platform_driver(pci_pwrctl_pwrseq_driver); + +MODULE_AUTHOR("Bartosz Golaszewski "); +MODULE_DESCRIPTION("Generic PCI Power Control module for power sequenced devices"); +MODULE_LICENSE("GPL"); From 06180b178035251b348a89611a69741ad710ad69 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Wed, 26 Jul 2023 10:17:38 +0200 Subject: [PATCH 673/707] drm/msm/dpu: Hack in 1:1:1 DSC topology --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 29 +++++++++------------ 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 83380bc92a00a9..22cb575d5d124a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -504,9 +504,8 @@ bool dpu_encoder_use_dsc_merge(struct drm_encoder *drm_enc) if (dpu_enc->phys_encs[i]) intf_count++; - /* See dpu_encoder_get_topology, we only support 2:2:1 topology */ if (dpu_enc->dsc) - num_dsc = 2; + num_dsc = 1; return (num_dsc > 0) && (num_dsc > intf_count); } @@ -567,8 +566,8 @@ static struct msm_display_topology dpu_encoder_get_topology( * this is power optimal and can drive up to (including) 4k * screens */ - topology.num_dsc = 2; - topology.num_lm = 2; + topology.num_dsc = 1; + topology.num_lm = 1; topology.num_intf = 1; } @@ -1832,46 +1831,45 @@ static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_ctl *ctl, static void dpu_encoder_prep_dsc(struct dpu_encoder_virt *dpu_enc, struct drm_dsc_config *dsc) { - /* coding only for 2LM, 2enc, 1 dsc config */ struct dpu_encoder_phys *enc_master = dpu_enc->cur_master; struct dpu_hw_ctl *ctl = enc_master->hw_ctl; struct dpu_hw_dsc *hw_dsc[MAX_CHANNELS_PER_ENC]; struct dpu_hw_pingpong *hw_pp[MAX_CHANNELS_PER_ENC]; int this_frame_slices; int intf_ip_w, enc_ip_w; - int dsc_common_mode; + int dsc_common_mode = 0; int pic_width; u32 initial_lines; int i; + int num_dsc = 0; for (i = 0; i < MAX_CHANNELS_PER_ENC; i++) { hw_pp[i] = dpu_enc->hw_pp[i]; hw_dsc[i] = dpu_enc->hw_dsc[i]; if (!hw_pp[i] || !hw_dsc[i]) { - DPU_ERROR_ENC(dpu_enc, "invalid params for DSC\n"); - return; + break; } + num_dsc++; } dsc_common_mode = 0; pic_width = dsc->pic_width; - dsc_common_mode = DSC_MODE_MULTIPLEX | DSC_MODE_SPLIT_PANEL; + if (num_dsc > 1) + dsc_common_mode |= DSC_MODE_SPLIT_PANEL; + if (dpu_encoder_use_dsc_merge(&dpu_enc->base)) + dsc_common_mode |= DSC_MODE_MULTIPLEX; if (enc_master->intf_mode == INTF_MODE_VIDEO) dsc_common_mode |= DSC_MODE_VIDEO; this_frame_slices = pic_width / dsc->slice_width; intf_ip_w = this_frame_slices * dsc->slice_width; - /* - * dsc merge case: when using 2 encoders for the same stream, - * no. of slices need to be same on both the encoders. - */ - enc_ip_w = intf_ip_w / 2; + enc_ip_w = intf_ip_w / num_dsc; initial_lines = dpu_encoder_dsc_initial_line_calc(dsc, enc_ip_w); - for (i = 0; i < MAX_CHANNELS_PER_ENC; i++) + for (i = 0; i < num_dsc; i++) dpu_encoder_dsc_pipe_cfg(ctl, hw_dsc[i], hw_pp[i], dsc, dsc_common_mode, initial_lines); } @@ -2015,7 +2013,6 @@ static void dpu_encoder_dsc_pipe_clr(struct dpu_hw_ctl *ctl, static void dpu_encoder_unprep_dsc(struct dpu_encoder_virt *dpu_enc) { - /* coding only for 2LM, 2enc, 1 dsc config */ struct dpu_encoder_phys *enc_master = dpu_enc->cur_master; struct dpu_hw_ctl *ctl = enc_master->hw_ctl; struct dpu_hw_dsc *hw_dsc[MAX_CHANNELS_PER_ENC]; From 4beb1292cc46d1410dbebf3d154682d3592eadaf Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Fri, 10 Nov 2023 22:03:37 +0100 Subject: [PATCH 674/707] arm64: dts: qcom: sm8450: Add clock-names to mdss Signed-off-by: Jens Reidel --- arch/arm64/boot/dts/qcom/sm8450.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi index c60ffe917b44b4..ba9f4a32bcd6c9 100644 --- a/arch/arm64/boot/dts/qcom/sm8450.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi @@ -2962,6 +2962,10 @@ <&gcc GCC_DISP_HF_AXI_CLK>, <&gcc GCC_DISP_SF_AXI_CLK>, <&dispcc DISP_CC_MDSS_MDP_CLK>; + clock-names = "iface_clk", + "gcc_bus", + "gcc_nrt_bus", + "branch_clk"; interrupts = ; interrupt-controller; From 84e42363ef18d5a793a373ccf536647252f318b6 Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Thu, 7 Sep 2023 20:55:48 +0200 Subject: [PATCH 675/707] arm64: configs: Add sm8450 defconfig Signed-off-by: Jens Reidel --- arch/arm64/configs/sm8450.config | 742 +++++++++++++++++++++++++++++++ 1 file changed, 742 insertions(+) create mode 100644 arch/arm64/configs/sm8450.config diff --git a/arch/arm64/configs/sm8450.config b/arch/arm64/configs/sm8450.config new file mode 100644 index 00000000000000..1f336f1f3b46f9 --- /dev/null +++ b/arch/arm64/configs/sm8450.config @@ -0,0 +1,742 @@ +# Qualcomm Snapdragon SM8450 config fragment +CONFIG_LOCALVERSION="-sm8450" +# CONFIG_LOCALVERSION_AUTO is not set + +# Common for SM8450 devices +# nothing yet + +# Xiaomi 12 (Cupid) +CONFIG_DRM_PANEL_XIAOMI_42_02_0A=y + +# SOC +CONFIG_NR_CPUS=8 +CONFIG_SCHED_CLUSTER=y +CONFIG_SCSI_UFS_QCOM=y +CONFIG_QCOM_LLCC=y +CONFIG_QCOM_OCMEM=y +CONFIG_QCOM_RMTFS_MEM=y +CONFIG_QCOM_SOCINFO=y +CONFIG_QCOM_APR=y +CONFIG_POWER_RESET_QCOM_PON=y +CONFIG_QCOM_SPMI_TEMP_ALARM=y +CONFIG_QCOM_SPMI_ADC_TM5=y +CONFIG_QCOM_GPI_DMA=y + +# SM8450 SOC +CONFIG_INTERCONNECT_QCOM_OSM_L3=y +CONFIG_INTERCONNECT_QCOM_SM8450=y +CONFIG_PINCTRL_LPASS_LPI=y +CONFIG_PINCTRL_SM8450=y +CONFIG_PINCTRL_SM8450_LPASS_LPI=y +CONFIG_SM_CAMCC_8450=y +CONFIG_SM_DISPCC_8450=y +CONFIG_SM_GCC_8450=y +CONFIG_SM_GPUCC_8450=y +CONFIG_SM_VIDEOCC_8450=y + +# Sound +CONFIG_SOUNDWIRE=y +CONFIG_SOUNDWIRE_QCOM=y +CONFIG_SND_SOC_CS35L41_I2C=y +CONFIG_SND_SOC_WCD938X_SDW=y +CONFIG_SND_SOC_LPASS_RX_MACRO=y +CONFIG_SND_SOC_LPASS_TX_MACRO=y +CONFIG_SND_SOC_LPASS_VA_MACRO=y +CONFIG_SND_SOC_SC8280XP=y +CONFIG_SND_SOC_QCOM=y + +# Remoteproc +CONFIG_SLIMBUS=y +CONFIG_SLIM_QCOM_CTRL=y +CONFIG_SLIM_QCOM_NGD_CTRL=y +CONFIG_REMOTEPROC_CDEV=y + +# Graphics +CONFIG_DRM=y +CONFIG_DRM_DISPLAY_HELPER=y +CONFIG_DRM_MSM=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y + +# Power management +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y + +# Misc useful things +CONFIG_SCSI_SCAN_ASYNC=y + +# Needed for mounting userdata on android +CONFIG_QFMT_V2=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y + +# HID/Input +CONFIG_HID_GENERIC=m +CONFIG_UHID=m +CONFIG_USB_HID=m +CONFIG_INPUT_EVDEV=y +CONFIG_BT_HIDP=m +CONFIG_INPUT_JOYDEV=m + +# USB +CONFIG_USB=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_ETH_RNDIS=y + +# GENI +CONFIG_I2C_QCOM_GENI=y + +CONFIG_LEDS_TRIGGER_ONESHOT=y +CONFIG_LEDS_TRIGGER_BACKLIGHT=y +CONFIG_LEDS_TRIGGER_ACTIVITY=y + +CONFIG_RPMSG_CHAR=y +CONFIG_RPMSG_QCOM_GLINK_SMEM=y +# Always load RFCOMM and BNEP as modules so they initialize properly +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_QCOM_FASTRPC=m +CONFIG_QCOM_PMIC_GLINK=y +CONFIG_QCOM_SPMI_ADC5=y +CONFIG_PHY_QCOM_PCIE2=y +CONFIG_PHY_QCOM_QMP=y +CONFIG_PHY_QCOM_QMP_COMBO=y +CONFIG_PHY_QCOM_QMP_UFS=y +CONFIG_PHY_QCOM_QMP_USB=y +CONFIG_PHY_QCOM_QMP_PCIE=y +CONFIG_PHY_QCOM_QMP_PCIE_8996=y +CONFIG_PHY_QCOM_QUSB2=y +CONFIG_PHY_QCOM_USB_HS_28NM=y +CONFIG_PHY_QCOM_USB_SNPS_FEMTO_V2=y +CONFIG_PHY_QCOM_USB_HS=y +CONFIG_PHY_QCOM_SNPS_EUSB2=y +CONFIG_PHY_QCOM_EUSB2_REPEATER=y +CONFIG_PHY_QCOM_USB_HSIC=y +CONFIG_PHY_QCOM_USB_SS=y +CONFIG_LEDS_CLASS_FLASH=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_WESTWOOD=y +CONFIG_DEFAULT_WESTWOOD=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_F2FS_FS=y +CONFIG_NLS_UTF8=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_SYN_COOKIES=y +CONFIG_UEVENT_HELPER=y +CONFIG_INPUT_UINPUT=m +CONFIG_PSTORE_PMSG=y +CONFIG_TYPEC=y +CONFIG_TYPEC_TCPM=y +CONFIG_TYPEC_QCOM_PMIC=y +CONFIG_TYPEC_UCSI=y +CONFIG_UCSI_PMIC_GLINK=y +CONFIG_TYPEC_MUX_FSA4480=y +CONFIG_TYPEC_MUX_GPIO_SBU=y +CONFIG_TYPEC_MUX_NB7VPQ904M=y +CONFIG_TYPEC_DP_ALTMODE=y +CONFIG_U_SERIAL_CONSOLE=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# Anbox +CONFIG_BRIDGE_NETFILTER=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX_DIAG=y +CONFIG_NETLINK_DIAG=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y + +# Waydroid +CONFIG_PSI=y + +# WLAN debugging +CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y + +CONFIG_NET_SCH_HTB=y +CONFIG_NET_SCH_PRIO=y +CONFIG_NET_SCH_MULTIQ=y + +# Debugging stuff +CONFIG_STACKTRACE=y + +#pmOS Related +CONFIG_VT=y +CONFIG_CRYPTO_XTS=y +CONFIG_TMPFS_XATTR=y +CONFIG_DM_CRYPT=y +CONFIG_BINFMT_MISC=m + +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NFT_CT=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y + +CONFIG_WIREGUARD=m +CONFIG_DRM_GUD=m + +# pmos containers kconfig +CONFIG_CGROUP_FREEZER=y +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MARK=m +CONFIG_DUMMY=m +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_NET_CLS_CGROUP=m +CONFIG_RT_GROUP_SCHED=y +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_VS=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_RR=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_DM_THIN_PROVISIONING=y +CONFIG_VXLAN=m +CONFIG_CGROUP_NET_PRIO=y +CONFIG_IPVLAN=m + +# pmOS ZRAM kconfig +CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC_STAT=y +CONFIG_ZRAM=m +CONFIG_ZRAM_MEMORY_TRACKING=y +CONFIG_CRYPTO_LZ4=m +CONFIG_LZ4_COMPRESS=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_ZSTD=m + +# pmOS iwd kconfig +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_KEY_DH_OPERATIONS=y +CONFIG_CRYPTO_KPP=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=y + +# LEDs +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_QCOM_FLASH=y + +#Sony PlayStation controllers +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y + +# Disable all unrelated stuffs afaik +CONFIG_ACPI=n +CONFIG_VIRTUALIZATION=n +CONFIG_PSTORE_DEFLATE_COMPRESS=n +CONFIG_HIBERNATION=n +CONFIG_FW_LOADER_USER_HELPER=n +CONFIG_FW_LOADER_USER_HELPER_FALLBACK=n +CONFIG_BLK_DEV_NVME=n +CONFIG_ATA=n +CONFIG_MTD=n +CONFIG_SRAM=n +CONFIG_MEGARAID_SAS=n +CONFIG_EEPROM_AT25=n +CONFIG_SCSI_MPT3SAS=n +CONFIG_BLK_DEV_MD=n +CONFIG_DM_MIRROR=n +CONFIG_DM_ZERO=n +CONFIG_EXT2_FS=n +CONFIG_EXT3_FS=n +CONFIG_BTRFS_FS=n +CONFIG_USB_DWC2=n +CONFIG_USB_CHIPIDEA=n +CONFIG_USB_MUSB_HDRC=n +CONFIG_USB_ISP1760=n +CONFIG_USB_HSIC_USB3503=n +CONFIG_USB_NET_PLUSB=n +CONFIG_TYPEC_FUSB302=n +CONFIG_EXTCON_PTN5150=n +CONFIG_REALTEK_PHY=n +CONFIG_NET_VENDOR_NI=n +CONFIG_NET_9P=n +CONFIG_CAN=n +CONFIG_BNX2X=n +CONFIG_MACB=n +CONFIG_IGB=n +CONFIG_IGBVF=n +CONFIG_SMC91X=n +CONFIG_MLX4_EN=n +CONFIG_MLX5_CORE=n +CONFIG_STMMAC_ETH=n +CONFIG_ATL1C=n +CONFIG_BRCMFMAC=n +CONFIG_WL18XX=n +CONFIG_WLCORE=n +CONFIG_ATH10K_PCI=n +CONFIG_NET_SCH_CBS=n +CONFIG_NET_SCH_ETF=n +CONFIG_NET_SCH_TAPRIO=n +CONFIG_NET_SCH_MQPRIO=n +CONFIG_NET_CLS_BASIC=n +CONFIG_NET_CLS_FLOWER=n +CONFIG_NET_CLS_ACT=n +CONFIG_NET_ACT_GACT=n +CONFIG_NET_ACT_MIRRED=n +CONFIG_NET_ACT_GATE=n +CONFIG_MDIO_BUS_MUX_MMIOREG=n +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=n +CONFIG_GPIO_DWAPB=n +CONFIG_COMMON_CLK_XGENE=n +CONFIG_SENSORS_ARM_SCPI=n +CONFIG_TCG_TPM=n +CONFIG_BATTERY_SBS=n +CONFIG_REGULATOR_VCTRL=n +CONFIG_THUNDER_NIC_BGX=n +CONFIG_THUNDER_NIC_RGX=n +CONFIG_MDIO_THUNDER=n +CONFIG_HW_RANDOM_CAVIUM=n +CONFIG_EEPROM_AT24=n +CONFIG_NET_DSA=n +CONFIG_VITESSE_PHY=n +CONFIG_SENSORS_LM90=n +CONFIG_SENSORS_INA2XX=n +CONFIG_SENSORS_ISL29018=n +CONFIG_PCI_PASID=n +CONFIG_UACCE=n +CONFIG_NOP_USB_XCEIV=n +CONFIG_SURFACE_PLATFORMS=n +CONFIG_SENSORS_LM75=n +CONFIG_SENSORS_PWM_FAN=n +CONFIG_SENSORS_INA3221=n +CONFIG_USB_CONN_GPIO=n +CONFIG_MICREL_PHY=n +CONFIG_COMMON_CLK_VC5=n +CONFIG_CRYPTO_DEV_CCREE=n +CONFIG_SND_SIMPLE_CARD=n +CONFIG_SND_SIMPLE_CARD_UTILS=n +CONFIG_SND_AUDIO_GRAPH_CARD=n +CONFIG_TYPEC_HD3SS3220=n +CONFIG_COMMON_CLK_CS2000_CP=n +CONFIG_PL330_DMA=n +CONFIG_NET_VENDOR_SOCIONEXT=n + +# Disable MII PHY device drivers +CONFIG_AQUANTIA_PHY=n +CONFIG_MICROSEMI_PHY=n + +# Disable Multiplexer I2C Chip support +CONFIG_I2C_MUX_PCA954x=n + +# Disable pressure sensors +CONFIG_MPL3115=n + +# Disable Display Panels +CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=n +CONFIG_DRM_PANEL_LVDS=n +CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=n +CONFIG_DRM_PANEL_RAYDIUM_RM67191=n +CONFIG_DRM_PANEL_SITRONIX_ST7703=n +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=n +CONFIG_DRM_PANEL_VISIONOX_VTDR6130=n + +# Disable I2C GPIO expanders +CONFIG_GPIO_MAX732X=n +CONFIG_GPIO_PCA953X=n + +# Disable Backlight & LCD device support +CONFIG_BACKLIGHT_LP855X=n + +# Disable IR I2C driver auto-selected by 'Autoselect ancillary drivers' +CONFIG_VIDEO_IMX219=n +CONFIG_VIDEO_IMX412=n +CONFIG_VIDEO_OV5640=n +CONFIG_VIDEO_OV5645=n + +# Disable Common SoC Audio options for Freescale CPUs: +CONFIG_SND_SOC_FSL_ASRC=n +CONFIG_SND_SOC_FSL_SAI=n +CONFIG_SND_SOC_FSL_AUDMIX=n +CONFIG_SND_SOC_FSL_SPDIF=n +CONFIG_SND_SOC_FSL_MICFIL=n + +# Disable Input Device Drivers +CONFIG_KEYBOARD_ADC=n +CONFIG_MOUSE_ELAN_I2C=n +CONFIG_TOUCHSCREEN_ATMEL_MXT=n +CONFIG_TOUCHSCREEN_GOODIX=n +CONFIG_TOUCHSCREEN_ELAN=n +CONFIG_TOUCHSCREEN_EDT_FT5X06=n + +# Disable I2C RTC drivers +CONFIG_RTC_DRV_DS1307=n +CONFIG_RTC_DRV_RK808=n +CONFIG_RTC_DRV_HYM8563=n +CONFIG_RTC_DRV_ISL1208=n +CONFIG_RTC_DRV_PCF85063=n +CONFIG_RTC_DRV_PCF85363=n +CONFIG_RTC_DRV_M41T80=n +CONFIG_RTC_DRV_BQ32K=n +CONFIG_RTC_DRV_RX8581=n +CONFIG_RTC_DRV_RV3028=n +CONFIG_RTC_DRV_RV8803=n + +# Disable SPI and I2C RTC drivers +CONFIG_RTC_DRV_DS3232=n +CONFIG_RTC_DRV_PCF2127=n + +# Disable on-CPU RTC drivers +CONFIG_RTC_DRV_PL031=n + +# Disable ARM errata workarounds via the alternatives framework +CONFIG_CAVIUM_ERRATUM_22375=n +CONFIG_CAVIUM_ERRATUM_23144=n +CONFIG_CAVIUM_ERRATUM_23154=n +CONFIG_CAVIUM_ERRATUM_27456=n +CONFIG_CAVIUM_ERRATUM_30115=n +CONFIG_CAVIUM_TX2_ERRATUM_219=n +CONFIG_FUJITSU_ERRATUM_010001=n +CONFIG_HISILICON_ERRATUM_161600802=n +CONFIG_NVIDIA_CARMEL_CNP_ERRATUM=n +CONFIG_ROCKCHIP_ERRATUM_3588001=n +CONFIG_SOCIONEXT_SYNQUACER_PREITS=n + +# Disable platforms +CONFIG_ARCH_ACTIONS=n +CONFIG_ARCH_SUNXI=n +CONFIG_ARCH_ALPINE=n +CONFIG_ARCH_APPLE=n +CONFIG_ARCH_BCM=n +CONFIG_ARCH_BCM2835=n +CONFIG_ARCH_BCM_IPROC=n +CONFIG_ARCH_BCMBCA=n +CONFIG_ARCH_BRCMSTB=n +CONFIG_ARCH_BERLIN=n +CONFIG_ARCH_BITMAIN=n +CONFIG_ARCH_EXYNOS=n +CONFIG_ARCH_SPARX5=n +CONFIG_ARCH_K3=n +CONFIG_ARCH_LG1K=n +CONFIG_ARCH_HISI=n +CONFIG_ARCH_KEEMBAY=n +CONFIG_ARCH_MEDIATEK=n +CONFIG_ARCH_MESON=n +CONFIG_ARCH_MVEBU=n +CONFIG_ARCH_NXP=n +CONFIG_ARCH_LAYERSCAPE=n +CONFIG_ARCH_MXC=n +CONFIG_ARCH_S32=n +CONFIG_ARCH_MA35=n +CONFIG_ARCH_NPCM=n +CONFIG_ARCH_REALTEK=n +CONFIG_ARCH_RENESAS=n +CONFIG_ARCH_ROCKCHIP=n +CONFIG_ARCH_SEATTLE=n +CONFIG_ARCH_INTEL_SOCFPGA=n +CONFIG_ARCH_STM32=n +CONFIG_ARCH_SYNQUACER=n +CONFIG_ARCH_TEGRA=n +CONFIG_ARCH_SPRD=n +CONFIG_ARCH_THUNDER=n +CONFIG_ARCH_THUNDER2=n +CONFIG_ARCH_UNIPHIER=n +CONFIG_ARCH_VEXPRESS=n +CONFIG_ARCH_VISCONTI=n +CONFIG_ARCH_XGENE=n +CONFIG_ARCH_ZYNQMP=n + +# Disable PCI controller drivers +CONFIG_PCIE_ALTERA=n +CONFIG_PCI_HOST_THUNDER_PEM=n +CONFIG_PCI_HOST_THUNDER_ECAM=n +CONFIG_PCI_XGENE=n + +# Disable DesignWare-based PCIe controllers +CONFIG_PCI_MESON=n +CONFIG_PCI_HISI=n +CONFIG_PCIE_KIRIN=n + +CONFIG_HIX5HD2_GMAC=n +CONFIG_HNS_DSAF=n +CONFIG_HNS_ENET=n +CONFIG_HNS3=n + +# Disable serial drivers +CONFIG_SERIAL_XILINX_PS_UART=n +CONFIG_SERIAL_FSL_LPUART=n +CONFIG_SERIAL_FSL_LINFLEXUART=n + +# Disable I2C system bus drivers (mostly embedded / system-on-chip) +CONFIG_I2C_DESIGNWARE_PLATFORM=n +CONFIG_I2C_RK3X=n + +# Disable SPI Master Controller Drivers +CONFIG_SPI_CADENCE_QUADSPI=n +CONFIG_SPI_DESIGNWARE=n +CONFIG_SPI_PL022=n + +# Disable pinctrls +CONFIG_PINCTRL_RK805=n +CONFIG_PINCTRL_IPQ5018=n +CONFIG_PINCTRL_IPQ5332=n +CONFIG_PINCTRL_IPQ8074=n +CONFIG_PINCTRL_IPQ6018=n +CONFIG_PINCTRL_IPQ9574=n +CONFIG_PINCTRL_MSM8916=n +CONFIG_PINCTRL_MSM8953=n +CONFIG_PINCTRL_MSM8976=n +CONFIG_PINCTRL_MSM8994=n +CONFIG_PINCTRL_MSM8996=n +CONFIG_PINCTRL_MSM8998=n +CONFIG_PINCTRL_QCM2290=n +CONFIG_PINCTRL_QCS404=n +CONFIG_PINCTRL_QDU1000=n +CONFIG_PINCTRL_SA8775P=n +CONFIG_PINCTRL_SC7180=n +CONFIG_PINCTRL_SC7280=n +CONFIG_PINCTRL_SC8180X=n +CONFIG_PINCTRL_SC8280XP=n +CONFIG_PINCTRL_SDM660=n +CONFIG_PINCTRL_SDM670=n +CONFIG_PINCTRL_SDM845=n +CONFIG_PINCTRL_SM6115=n +CONFIG_PINCTRL_SM6125=n +CONFIG_PINCTRL_SM6350=n +CONFIG_PINCTRL_SM6375=n +CONFIG_PINCTRL_SM8150=n +CONFIG_PINCTRL_SM8250=n +CONFIG_PINCTRL_SM8350=n +CONFIG_PINCTRL_SM8550=n +CONFIG_PINCTRL_SC7280_LPASS_LPI=n +CONFIG_PINCTRL_SM8250_LPASS_LPI=n +CONFIG_PINCTRL_SC8280XP_LPASS_LPI=n +CONFIG_PINCTRL_SM8550_LPASS_LPI=n + +# Disable clock drivers +CONFIG_QCOM_CLK_APCS_MSM8916=n +CONFIG_QCOM_CLK_APCC_MSM8996=n +CONFIG_IPQ_GCC_5018=n +CONFIG_IPQ_GCC_5332=n +CONFIG_IPQ_GCC_6018=n +CONFIG_IPQ_GCC_8074=n +CONFIG_IPQ_GCC_9574=n +CONFIG_MSM_GCC_8916=n +CONFIG_MSM_MMCC_8994=n +CONFIG_MSM_GCC_8994=n +CONFIG_MSM_GCC_8996=n +CONFIG_MSM_MMCC_8996=n +CONFIG_MSM_GCC_8998=n +CONFIG_MSM_MMCC_8998=n +CONFIG_QCM_GCC_2290=n +CONFIG_QCM_DISPCC_2290=n +CONFIG_QCS_GCC_404=n +CONFIG_SC_DISPCC_8280XP=n +CONFIG_SA_GCC_8775P=n +CONFIG_SA_GPUCC_8775P=n +CONFIG_SC_GCC_7180=n +CONFIG_SC_GCC_7280=n +CONFIG_SC_GCC_8180X=n +CONFIG_SC_GCC_8280XP=n +CONFIG_SC_GPUCC_7180=n +CONFIG_SC_GPUCC_8280XP=n +CONFIG_SC_LPASSCC_8280XP=n +CONFIG_SDM_CAMCC_845=n +CONFIG_SDM_GCC_845=n +CONFIG_SDM_GPUCC_845=n +CONFIG_SDM_VIDEOCC_845=n +CONFIG_SDM_DISPCC_845=n +CONFIG_SDM_LPASSCC_845=n +CONFIG_SM_CAMCC_8250=n +CONFIG_SM_DISPCC_6115=n +CONFIG_SM_DISPCC_8250=n +CONFIG_SM_DISPCC_8550=n +CONFIG_SM_GCC_6115=n +CONFIG_SM_GCC_8150=n +CONFIG_SM_GCC_8250=n +CONFIG_SM_GCC_8350=n +CONFIG_SM_GCC_8550=n +CONFIG_SM_GPUCC_6115=n +CONFIG_SM_GPUCC_8150=n +CONFIG_SM_GPUCC_8250=n +CONFIG_SM_TCSRCC_8550=n +CONFIG_SM_VIDEOCC_8250=n +CONFIG_CLK_GFM_LPASS_SM8250=n + +# Disable interconnect drivers +CONFIG_INTERCONNECT_QCOM_MSM8916=n +CONFIG_INTERCONNECT_QCOM_MSM8996=n +CONFIG_INTERCONNECT_QCOM_QCM2290=n +CONFIG_INTERCONNECT_QCOM_QCS404=n +CONFIG_INTERCONNECT_QCOM_SA8775P=n +CONFIG_INTERCONNECT_QCOM_SC7180=n +CONFIG_INTERCONNECT_QCOM_SC7280=n +CONFIG_INTERCONNECT_QCOM_SC8180X=n +CONFIG_INTERCONNECT_QCOM_SC8280XP=n +CONFIG_INTERCONNECT_QCOM_SDM845=n +CONFIG_INTERCONNECT_QCOM_SM8150=n +CONFIG_INTERCONNECT_QCOM_SM8250=n +CONFIG_INTERCONNECT_QCOM_SM8350=n +CONFIG_INTERCONNECT_QCOM_SM8550=n + +# Disable memory mapped GPIO drivers +CONFIG_GPIO_ALTERA=n +CONFIG_GPIO_MB86S7X=n +CONFIG_GPIO_PL061=n +CONFIG_GPIO_WCD934X=n +CONFIG_GPIO_XGENE=n + +# Disable Virtual GPIO drivers +CONFIG_POWER_RESET_XGENE=n +CONFIG_POWER_RESET_SYSCON=n +CONFIG_GNSS_MTK_SERIAL=n + +# Disable Watchdog Device Drivers +CONFIG_ARM_SP805_WATCHDOG=n +CONFIG_ARM_SBSA_WATCHDOG=n +CONFIG_DW_WATCHDOG=n +CONFIG_ARM_SMC_WATCHDOG=n + +# Disable Multifunction device drivers +CONFIG_MFD_BD9571MWV=n +CONFIG_MFD_AXP20X_I2C=n +CONFIG_MFD_HI6421_PMIC=n +CONFIG_MFD_MAX77620=n +CONFIG_MFD_MT6360=n +CONFIG_MFD_MT6397=n +CONFIG_MFD_RK8XX=n +CONFIG_MFD_SEC_CORE=n +CONFIG_MFD_TI_AM335X_TSCADC=n +CONFIG_MFD_TPS65219=n +CONFIG_MFD_WM8994=n +CONFIG_MFD_ROHM_BD718XX=n +CONFIG_MFD_WCD934X=n + +CONFIG_REGULATOR_FAN53555=n +CONFIG_REGULATOR_MAX8973=n +CONFIG_REGULATOR_MP8859=n +CONFIG_REGULATOR_MT6315=n +CONFIG_REGULATOR_MT6360=n +CONFIG_REGULATOR_PCA9450=n +CONFIG_REGULATOR_PF8X00=n +CONFIG_REGULATOR_PFUZE100=n +CONFIG_REGULATOR_RAA215300=n +CONFIG_REGULATOR_RK808=n +CONFIG_REGULATOR_TPS65132=n +CONFIG_REGULATOR_TPS65219=n + +# Disable media device types +CONFIG_MEDIA_ANALOG_TV_SUPPORT=n +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=n +CONFIG_MEDIA_SDR_SUPPORT=n + +# Disable I2C encoder or helper chips +CONFIG_DRM_I2C_NXP_TDA998X=n + +# Disable ARM devices +CONFIG_DRM_MALI_DISPLAY=n +CONFIG_DRM_KOMEDA=n + +# Disable DRM Drivers +CONFIG_DRM_NOUVEAU=n + +# Disable display Interface Bridges +CONFIG_DRM_LONTIUM_LT8912B=n +CONFIG_DRM_LONTIUM_LT9611=n +CONFIG_DRM_LONTIUM_LT9611UXC=n +CONFIG_DRM_ITE_IT66121=n +CONFIG_DRM_NWL_MIPI_DSI=n +CONFIG_DRM_PARADE_PS8640=n +CONFIG_DRM_SAMSUNG_DSIM=n +CONFIG_DRM_SII902X=n +CONFIG_DRM_THINE_THC63LVD1024=n +CONFIG_DRM_TOSHIBA_TC358768=n +CONFIG_DRM_TI_TFP410=n +CONFIG_DRM_TI_SN65DSI83=n +CONFIG_DRM_TI_SN65DSI86=n +CONFIG_DRM_ANALOGIX_ANX7625=n +CONFIG_DRM_I2C_ADV7511=n +CONFIG_DRM_CDNS_MHDP8546=n + +CONFIG_DRM_ETNAVIV=n +CONFIG_DRM_HISI_HIBMC=n +CONFIG_DRM_HISI_KIRIN=n +CONFIG_DRM_PL111=n +CONFIG_DRM_LIMA=n +CONFIG_DRM_PANFROST=n + +# Disable CODEC drivers +CONFIG_SND_SOC_ADAU7002=n +CONFIG_SND_SOC_AK4613=n +CONFIG_SND_SOC_DA7213=n +CONFIG_SND_SOC_ES7134=n +CONFIG_SND_SOC_ES7241=n +CONFIG_SND_SOC_ES8316=n +CONFIG_SND_SOC_GTM601=n +CONFIG_SND_SOC_MAX98357A=n +CONFIG_SND_SOC_MAX98927=n +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=n +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=n +CONFIG_SND_SOC_PCM3168A_I2C=n +CONFIG_SND_SOC_RK817=n +CONFIG_SND_SOC_RL6231=n +CONFIG_SND_SOC_RT5640=n +CONFIG_SND_SOC_RT5659=n +CONFIG_SND_SOC_RT5663=n +CONFIG_SND_SOC_RT5682=n +CONFIG_SND_SOC_SGTL5000=n +CONFIG_SND_SOC_TAS2552=n +CONFIG_SND_SOC_TAS571X=n +CONFIG_SND_SOC_TLV320AIC31XX=n +CONFIG_SND_SOC_TLV320AIC32X4=n +CONFIG_SND_SOC_TLV320AIC3X=n +CONFIG_SND_SOC_TS3A227E=n +CONFIG_SND_SOC_WCD9335=n +CONFIG_SND_SOC_WCD934X=n +CONFIG_SND_SOC_WM8524=n +CONFIG_SND_SOC_WM8904=n +CONFIG_SND_SOC_WM8960=n +CONFIG_SND_SOC_WM8962=n +CONFIG_SND_SOC_WM8978=n +CONFIG_SND_SOC_WSA881X=n +CONFIG_SND_SOC_MT6358=n +CONFIG_SND_SOC_NAU8822=n + +# Disable USB Host Controller Drivers +CONFIG_USB_XHCI_PCI_RENESAS=n + +# Disable MMC/SD/SDIO Host Controller Drivers +CONFIG_MMC_SDHCI_OF_ARASAN=n +CONFIG_MMC_SDHCI_CADENCE=n +CONFIG_MMC_SDHCI_F_SDH30=n +CONFIG_MMC_DW_EXYNOS=n +CONFIG_MMC_DW_HI3798CV200=n +CONFIG_MMC_DW_K3=n +CONFIG_MMC_MTK=n +CONFIG_MMC_SDHCI_XENON=n +CONFIG_MMC_SDHCI_AM654=n + +CONFIG_FSL_EDMA=n +CONFIG_MV_XOR_V2=n +CONFIG_COMMON_CLK_RK808=n +CONFIG_FSL_RCPM=n +CONFIG_ARCH_R9A07G044=n +CONFIG_MAX9611=n +CONFIG_NVMEM_RMEM=n +CONFIG_FPGA=n From f4f0cc4d3a17be9d335f71919cfd7dde5d1aa779 Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Sun, 5 Nov 2023 18:09:10 +0100 Subject: [PATCH 676/707] drm: panel: Generate panel-l3-42-02-0a driver Signed-off-by: Jens Reidel --- drivers/gpu/drm/panel/Kconfig | 10 + drivers/gpu/drm/panel/Makefile | 1 + drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c | 453 ++++++++++++++++++ 3 files changed, 464 insertions(+) create mode 100644 drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index 8f3783742208b6..250841246ae881 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -875,6 +875,16 @@ config DRM_PANEL_WIDECHIPS_WS2401 480x800 display controller used in panels such as Samsung LMS380KF01. This display is used in the Samsung Galaxy Ace 2 GT-I8160 (Codina). +config DRM_PANEL_XIAOMI_42_02_0A + tristate "Xiaomi 42_02_0A panel driver" + depends on OF + depends on DRM_MIPI_DSI + depends on BACKLIGHT_CLASS_DEVICE + select VIDEOMODE_HELPERS + help + Say Y or M here if you want to enable support for the Xiaomi FHD + (2400x1080@120Hz) cmd mode panel. + config DRM_PANEL_XINPENG_XPP055C272 tristate "Xinpeng XPP055C272 panel driver" depends on OF diff --git a/drivers/gpu/drm/panel/Makefile b/drivers/gpu/drm/panel/Makefile index d94a644d0a6cea..471bfcdf590fc7 100644 --- a/drivers/gpu/drm/panel/Makefile +++ b/drivers/gpu/drm/panel/Makefile @@ -89,4 +89,5 @@ obj-$(CONFIG_DRM_PANEL_VISIONOX_RM69299) += panel-visionox-rm69299.o obj-$(CONFIG_DRM_PANEL_VISIONOX_VTDR6130) += panel-visionox-vtdr6130.o obj-$(CONFIG_DRM_PANEL_VISIONOX_R66451) += panel-visionox-r66451.o obj-$(CONFIG_DRM_PANEL_WIDECHIPS_WS2401) += panel-widechips-ws2401.o +obj-$(CONFIG_DRM_PANEL_XIAOMI_42_02_0A) += panel-l3-42-02-0a-dsc.o obj-$(CONFIG_DRM_PANEL_XINPENG_XPP055C272) += panel-xinpeng-xpp055c272.o diff --git a/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c b/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c new file mode 100644 index 00000000000000..1553e20a7be32e --- /dev/null +++ b/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2023 FIXME +// Generated with linux-mdss-dsi-panel-driver-generator from vendor device tree: +// Copyright (c) 2013, The Linux Foundation. All rights reserved. (FIXME) + +#include +#include +#include +#include +#include + +#include